diff options
author    Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
committer Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
commit    1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree      867510a147cd095f19499d26b7c02d27de4cae9d /src/arch/hsail/insts
parent    28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
download  gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/arch/hsail/insts')
-rw-r--r--  src/arch/hsail/insts/branch.cc          |   86
-rw-r--r--  src/arch/hsail/insts/branch.hh          |  442
-rw-r--r--  src/arch/hsail/insts/decl.hh            | 1106
-rw-r--r--  src/arch/hsail/insts/gpu_static_inst.cc |   64
-rw-r--r--  src/arch/hsail/insts/gpu_static_inst.hh |   65
-rw-r--r--  src/arch/hsail/insts/main.cc            |  208
-rw-r--r--  src/arch/hsail/insts/mem.cc             |  139
-rw-r--r--  src/arch/hsail/insts/mem.hh             | 1629
-rw-r--r--  src/arch/hsail/insts/mem_impl.hh        |  660
-rw-r--r--  src/arch/hsail/insts/pseudo_inst.cc     |  787
10 files changed, 5186 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc new file mode 100644 index 000000000..d65279cc8 --- /dev/null +++ b/src/arch/hsail/insts/branch.cc @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/branch.hh" + +#include "gpu-compute/hsail_code.hh" + +namespace HsailISA +{ + GPUStaticInst* + decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // register operand. + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrnIndirectInst(ib, obj); + } else { + return new BrnDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new CbrIndirectInst(ib, obj); + } else { + return new CbrDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrIndirectInst(ib, obj); + } else { + return new BrDirectInst(ib, obj); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh new file mode 100644 index 000000000..54ad9a042 --- /dev/null +++ b/src/arch/hsail/insts/branch.hh @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__ +#define __ARCH_HSAIL_INSTS_BRANCH_HH__ + +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ + + // The main difference between a direct branch and an indirect branch + // is whether the target is a register or a label, so we can share a + // lot of code if we template the base implementation on that type. + template<typename TargetType> + class BrnInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + TargetType target; + + BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "brn") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr*)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + + bool isSrcOperand(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return true; + } + + bool isDstOperand(int operandIndex) { + return false; + } + + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < 
getNumOperands()); + return target.regIndex(); + } + + int getNumOperands() { + return 1; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename TargetType> + void + BrnInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrnDirectInst : public BrnInstBase<LabelOperand> + { + public: + BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<LabelOperand>(ib, obj) + { + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrnIndirectInst : public BrnInstBase<SRegOperand> + { + public: + BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class CbrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + CRegOperand cond; + TargetType target; + + CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "cbr") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr *)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + cond.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t 
getTargetPc() override { return target.getTarget(0, 0); } + + void execute(GPUDynInstPtr gpuDynInst); + // Assumption: Target is operand 0, Condition Register is operand 1 + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isVectorRegister(); + else + return false; + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isCondRegister(); + else + return true; + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return target.isScalarRegister(); + else + return false; + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == 0) + return true; + return false; + } + // both Condition Register and Target are source operands + bool isDstOperand(int operandIndex) { + return false; + } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.opSize(); + else + return 1; + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.regIndex(); + else + return -1; + } + + // Operands = Target, Condition Register + int getNumOperands() { + return 2; + } + }; + + template<typename TargetType> + void + CbrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s,%s", opcode, widthClause, + cond.disassemble(), target.disassemble()); + } + + template<typename TargetType> + void + CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const uint32_t curr_pc = w->pc(); + const uint32_t curr_rpc = 
w->rpc(); + const VectorMask curr_mask = w->execMask(); + + /** + * TODO: can we move this pop outside the instruction, and + * into the wavefront? + */ + w->popFromReconvergenceStack(); + + // immediate post-dominator instruction + const uint32_t rpc = static_cast<uint32_t>(ipdInstNum()); + if (curr_rpc != rpc) { + w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask); + } + + // taken branch + const uint32_t true_pc = getTargetPc(); + VectorMask true_mask; + for (unsigned int lane = 0; lane < VSZ; ++lane) { + true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane]; + } + + // not taken branch + const uint32_t false_pc = curr_pc + 1; + assert(true_pc != false_pc); + if (false_pc != rpc && true_mask.count() < curr_mask.count()) { + VectorMask false_mask = curr_mask & ~true_mask; + w->pushToReconvergenceStack(false_pc, rpc, false_mask); + } + + if (true_pc != rpc && true_mask.count()) { + w->pushToReconvergenceStack(true_pc, rpc, true_mask); + } + assert(w->pc() != curr_pc); + w->discardFetch(); + } + + + class CbrDirectInst : public CbrInstBase<LabelOperand> + { + public: + CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<LabelOperand>(ib, obj) + { + } + // the source operand of a conditional branch is a Condition + // Register which is not stored in the VRF + // so we do not count it as a source-register operand + // even though, formally, it is one. + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class CbrIndirectInst : public CbrInstBase<SRegOperand> + { + public: + CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<SRegOperand>(ib, obj) + { + } + // one source operand of the conditional indirect branch is a Condition + // register which is not stored in the VRF so we do not count it + // as a source-register operand even though, formally, it is one. 
+ int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class BrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + ImmOperand<uint32_t> width; + TargetType target; + + BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "br") + { + o_type = Enums::OT_BRANCH; + width.init(((Brig::BrigInstBr *)ib)->width, obj); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + + void execute(GPUDynInstPtr gpuDynInst); + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + int getNumOperands() { return 1; } + }; + + template<typename TargetType> + void + BrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width.bits != 1) { + widthClause = csprintf("_width(%d)", width.bits); + 
} + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrDirectInst : public BrInstBase<LabelOperand> + { + public: + BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<LabelOperand>(ib, obj) + { + } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrIndirectInst : public BrInstBase<SRegOperand> + { + public: + BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib, + const BrigObject *obj); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__ diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh new file mode 100644 index 000000000..e2da501b9 --- /dev/null +++ b/src/arch/hsail/insts/decl.hh @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_DECL_HH__ +#define __ARCH_HSAIL_INSTS_DECL_HH__ + +#include <cmath> + +#include "arch/hsail/generic_types.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "debug/HSAIL.hh" +#include "enums/OpType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +namespace HsailISA +{ + template<typename _DestOperand, typename _SrcOperand> + class HsailOperandType + { + public: + typedef _DestOperand DestOperand; + typedef _SrcOperand SrcOperand; + }; + + typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType; + typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType; + typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType; + + // The IsBits parameter serves only to disambiguate tbhe B* types from + // the U* types, which otherwise would be identical (and + // indistinguishable). 
+ template<typename _OperandType, typename _CType, Enums::MemType _memType, + vgpr_type _vgprType, int IsBits=0> + class HsailDataType + { + public: + typedef _OperandType OperandType; + typedef _CType CType; + static const Enums::MemType memType = _memType; + static const vgpr_type vgprType = _vgprType; + static const char *label; + }; + + typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1; + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8; + + typedef HsailDataType<SRegOperandType, uint16_t, + Enums::M_U16, VT_32, 1> B16; + + typedef HsailDataType<SRegOperandType, uint32_t, + Enums::M_U32, VT_32, 1> B32; + + typedef HsailDataType<DRegOperandType, uint64_t, + Enums::M_U64, VT_64, 1> B64; + + typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8; + typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16; + typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32; + typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64; + + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8; + typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16; + typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32; + typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64; + + typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32; + typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64; + + template<typename DestOperandType, typename SrcOperandType, + int NumSrcOperands> + class CommonInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename SrcOperandType::SrcOperand src[NumSrcOperands]; + + void + generateDisassembly() + { + disassembly = csprintf("%s%s %s", opcode, opcode_suffix(), + dest.disassemble()); + + for (int i = 0; i < NumSrcOperands; ++i) { + disassembly += ","; + disassembly += src[i].disassemble(); + } + 
} + + virtual std::string opcode_suffix() = 0; + + public: + CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + + dest.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + return false; + } + + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= NumSrcOperands) + return true; + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + + if (operandIndex < NumSrcOperands) + return src[operandIndex].regIndex(); + else + return dest.regIndex(); + } + int numSrcRegOperands() { + int operands = 0; + for (int i = 
0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return NumSrcOperands + 1; } + }; + + template<typename DataType, int NumSrcOperands> + class ArithInst : public CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands> + { + public: + std::string opcode_suffix() { return csprintf("_%s", DataType::label); } + + ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + typename Src1OperandType, typename Src2OperandType> + class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + typename Src2OperandType::SrcOperand src2; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble(), + src2.disassemble()); + } + + public: + ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 3); + src2.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return 
src1.isVectorRegister(); + else if (operandIndex == 2) + return src2.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else if (operandIndex == 2) + return src2.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else if (operandIndex == 2) + return src2.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 3) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 3) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else if (operandIndex == 2) + return src2.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else if (operandIndex == 2) + return src2.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + if (src2.isVectorRegister() == true) { + operands++; + } + return 
operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 4; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType, typename Src2DataType> + class ThreeNonUniformSourceInst : + public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + typedef typename Src2DataType::CType Src2CType; + + ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + template<typename DataType> + class CmovInst : public ThreeNonUniformSourceInst<DataType, B1, + DataType, DataType> + { + public: + CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, B1, DataType, + DataType>(ib, obj, opcode) + { + } + }; + + template<typename DataType> + class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType, + DataType, U32, + U32> + { + public: + ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, DataType, U32, + U32>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + typename Src1OperandType> + class TwoNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + + void + generateDisassembly() + { + disassembly = 
csprintf("%s %s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble()); + } + + + public: + TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 2) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 2) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && 
(operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 3; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType> + class TwoNonUniformSourceInst : + public TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + + TwoNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + // helper function for ClassInst + template<typename T> + bool + fpclassify(T src0, uint32_t src1) + { + int fpclass = std::fpclassify(src0); + + if ((src1 & 0x3) && (fpclass == FP_NAN)) { + return true; + } + + if (src0 <= -0.0) { + if ((src1 & 0x4) && fpclass == FP_INFINITE) + return true; + if ((src1 & 0x8) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x10) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x20) && fpclass == FP_ZERO) + return true; + } else { + if ((src1 & 0x40) && fpclass == FP_ZERO) + return true; + if ((src1 & 0x80) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x100) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x200) && fpclass == FP_INFINITE) + return true; + } + return false; + } + + template<typename DataType> + class ClassInst : public 
TwoNonUniformSourceInst<B1, DataType, U32>
+ {
+ public:
+ ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // HSAIL shift instructions: dest and src0 share the data type, the
+ // shift amount (src1) is always u32
+ template<typename DataType>
+ class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+ {
+ public:
+ ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // helper function for CmpInst
+ // NOTE(review): the ordered (EQ/NE/...), unordered (..U) and signaling
+ // (S..) variants of each relation are all folded onto the same C++
+ // comparison here; for NaN operands the unordered forms should compare
+ // true, but a C++ comparison yields false -- confirm against the HSAIL
+ // PRM if NaN-sensitive compares matter
+ template<typename T>
+ bool
+ compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ case BRIG_COMPARE_EQU:
+ case BRIG_COMPARE_SEQ:
+ case BRIG_COMPARE_SEQU:
+ return (src0 == src1);
+
+ case BRIG_COMPARE_NE:
+ case BRIG_COMPARE_NEU:
+ case BRIG_COMPARE_SNE:
+ case BRIG_COMPARE_SNEU:
+ return (src0 != src1);
+
+ case BRIG_COMPARE_LT:
+ case BRIG_COMPARE_LTU:
+ case BRIG_COMPARE_SLT:
+ case BRIG_COMPARE_SLTU:
+ return (src0 < src1);
+
+ case BRIG_COMPARE_LE:
+ case BRIG_COMPARE_LEU:
+ case BRIG_COMPARE_SLE:
+ case BRIG_COMPARE_SLEU:
+ return (src0 <= src1);
+
+ case BRIG_COMPARE_GT:
+ case BRIG_COMPARE_GTU:
+ case BRIG_COMPARE_SGT:
+ case BRIG_COMPARE_SGTU:
+ return (src0 > src1);
+
+ case BRIG_COMPARE_GE:
+ case BRIG_COMPARE_GEU:
+ case BRIG_COMPARE_SGE:
+ case BRIG_COMPARE_SGEU:
+ return (src0 >= src1);
+
+ case BRIG_COMPARE_NUM:
+ case BRIG_COMPARE_SNUM:
+ // num: true iff BOTH operands are ordered (a value is non-NaN
+ // iff it compares equal to itself); this was '||', which
+ // wrongly reported num as true when only one operand was a
+ // number
+ return (src0 == src0) && (src1 == src1);
+
+ case BRIG_COMPARE_NAN:
+ case BRIG_COMPARE_SNAN:
+ // nan: true iff at least one operand is NaN (x != x only
+ // holds for NaN)
+ return (src0 != src0) || (src1 != src1);
+
+ default:
+ fatal("Bad cmpOp value %d\n", (int)cmpOp);
+ }
+ }
+
+ // helper for firstbit: count the leading bits equal to zero (or, for
+ // negative signed values, equal to the sign bit); -1 if no set bit
+ template<typename T>
+ int32_t
+ firstbit(T src0)
+ {
+ if (!src0)
+ return -1;
+
+ //handle positive and negative numbers
+ T tmp = (src0 < 0) ? 
(~src0) : (src0); + + //the starting pos is MSB + int pos = 8 * sizeof(T) - 1; + int cnt = 0; + + //search the first bit set to 1 + while (!(tmp & (1 << pos))) { + ++cnt; + --pos; + } + return cnt; + } + + const char* cmpOpToString(Brig::BrigCompareOperation cmpOp); + + template<typename DestOperandType, typename SrcOperandType> + class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType, + 2> + { + protected: + Brig::BrigCompareOperation cmpOp; + + public: + CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj, + _opcode) + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP); + Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib; + cmpOp = (Brig::BrigCompareOperation)i->compare; + } + }; + + template<typename DestDataType, typename SrcDataType> + class CmpInst : public CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType> + { + public: + std::string + opcode_suffix() + { + return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp), + DestDataType::label, SrcDataType::label); + } + + CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType>(ib, obj, _opcode) + { + } + }; + + template<typename DestDataType, typename SrcDataType> + class CvtInst : public CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, 1> + { + public: + std::string opcode_suffix() + { + return csprintf("_%s_%s", DestDataType::label, SrcDataType::label); + } + + CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, + 1>(ib, obj, _opcode) + { + } + }; + + class SpecialInstNoSrcNoDest : public HsailGPUStaticInst + { + public: + SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib, + const 
BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 0; } + }; + + template<typename DestOperandType> + class SpecialInstNoSrcBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + + void generateDisassembly() + { + disassembly = csprintf("%s %s", opcode, dest.disassemble()); + } + + public: + SpecialInstNoSrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int 
numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInstNoSrc : + public SpecialInstNoSrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + template<typename DestOperandType> + class SpecialInst1SrcBase : public HsailGPUStaticInst + { + protected: + typedef int SrcCType; // used in execute() template + + typename DestOperandType::DestOperand dest; + ImmOperand<SrcCType> src0; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(), + src0.disassemble()); + } + + public: + SpecialInst1SrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return 
dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInst1Src : + public SpecialInst1SrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + class Ret : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "ret") + { + o_type = Enums::OT_RET; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class Barrier : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + uint8_t width; + + Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "barrier") + { + o_type = Enums::OT_BARRIER; + assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); + width = (uint8_t)((Brig::BrigInstBr*)ib)->width; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class MemFence : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Brig::BrigMemoryOrder memFenceMemOrder; + Brig::BrigMemoryScope memFenceScopeSegGroup; + Brig::BrigMemoryScope memFenceScopeSegGlobal; + Brig::BrigMemoryScope memFenceScopeSegImage; + + MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "memfence") + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE); + + memFenceScopeSegGlobal = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope; + + memFenceScopeSegGroup = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope; + + memFenceScopeSegImage = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope; + 
+ memFenceMemOrder = (Brig::BrigMemoryOrder) + ((Brig::BrigInstMemFence*)ib)->memoryOrder; + + // set o_type based on scopes + if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && + memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_BOTH_MEMFENCE; + } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_GLOBAL_MEMFENCE; + } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_SHARED_MEMFENCE; + } else { + fatal("MemFence constructor: bad scope specifiers\n"); + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + Wavefront *wave = gpuDynInst->wavefront(); + wave->computeUnit->injectGlobalMemFence(gpuDynInst); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + // 2 cases: + // * memfence to a sequentially consistent memory (e.g., LDS). + // These can be handled as no-ops. + // * memfence to a relaxed consistency cache (e.g., Hermes, Viper, + // etc.). We send a packet, tagged with the memory order and + // scope, and let the GPU coalescer handle it. 
+ + if (o_type == Enums::OT_GLOBAL_MEMFENCE || + o_type == Enums::OT_BOTH_MEMFENCE) { + gpuDynInst->simdId = w->simdId; + gpuDynInst->wfSlotId = w->wfSlotId; + gpuDynInst->wfDynId = w->wfDynId; + gpuDynInst->kern_id = w->kern_id; + gpuDynInst->cu_id = w->computeUnit->cu_id; + + gpuDynInst->memoryOrder = + getGenericMemoryOrder(memFenceMemOrder); + gpuDynInst->scope = + getGenericMemoryScope(memFenceScopeSegGlobal); + gpuDynInst->useContinuation = false; + GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); + gmp->getGMReqFIFO().push(gpuDynInst); + + w->wr_gm_reqs_in_pipe--; + w->rd_gm_reqs_in_pipe--; + w->mem_reqs_in_pipe--; + w->outstanding_reqs++; + } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + // no-op + } else { + fatal("MemFence execute: bad o_type\n"); + } + } + }; + + class Call : public HsailGPUStaticInst + { + public: + // private helper functions + void calcAddr(Wavefront* w, GPUDynInstPtr m); + + void + generateDisassembly() + { + if (dest.disassemble() == "") { + disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(), + src1.disassemble()); + } else { + disassembly = csprintf("%s %s (%s) (%s)", opcode, + src0.disassemble(), dest.disassemble(), + src1.disassemble()); + } + } + + bool + isPseudoOp() + { + std::string func_name = src0.disassemble(); + if (func_name.find("__gem5_hsail_op") != std::string::npos) { + return true; + } + return false; + } + + // member variables + ListOperand dest; + FunctionRefOperand src0; + ListOperand src1; + HsailCode *func_ptr; + + // exec function for pseudo instructions mapped on top of call opcode + void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst); + + // user-defined pseudo instructions + void MagicPrintLane(Wavefront *w); + void MagicPrintLane64(Wavefront *w); + void MagicPrintWF32(Wavefront *w); + void MagicPrintWF64(Wavefront *w); + void MagicPrintWFFloat(Wavefront *w); + void MagicSimBreak(Wavefront *w); + void MagicPrefixSum(Wavefront *w); + void MagicReduction(Wavefront *w); + 
void MagicMaskLower(Wavefront *w); + void MagicMaskUpper(Wavefront *w); + void MagicJoinWFBar(Wavefront *w); + void MagicWaitWFBar(Wavefront *w); + void MagicPanic(Wavefront *w); + + void MagicAtomicNRAddGlobalU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicAtomicNRAddGroupU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst); + + void MagicXactCasLd(Wavefront *w); + void MagicMostSigThread(Wavefront *w); + void MagicMostSigBroadcast(Wavefront *w); + + void MagicPrintWF32ID(Wavefront *w); + void MagicPrintWFID64(Wavefront *w); + + Call(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "call") + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + func_ptr = nullptr; + std::string func_name = src0.disassemble(); + if (!isPseudoOp()) { + func_ptr = dynamic_cast<HsailCode*>(obj-> + getFunction(func_name)); + + if (!func_ptr) + fatal("call::exec cannot find function: %s\n", func_name); + } + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + std::string func_name = src0.disassemble(); + if (isPseudoOp()) { + execPseudoInst(w, gpuDynInst); + } else { + fatal("Native HSAIL functions are not yet implemented: %s\n", + func_name); + } + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int 
getNumOperands() { return 2; } + }; + + template<typename T> T heynot(T arg) { return ~arg; } + template<> inline bool heynot<bool>(bool arg) { return !arg; } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_DECL_HH__ diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc new file mode 100644 index 000000000..bbaeb13e6 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/gpu_static_inst.hh" + +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj, + const std::string &opcode) + : GPUStaticInst(opcode), hsailCode(obj->currentCode) + { + } + + void + HsailGPUStaticInst::generateDisassembly() + { + disassembly = opcode; + } + + const std::string& + HsailGPUStaticInst::disassemble() + { + if (disassembly.empty()) { + generateDisassembly(); + assert(!disassembly.empty()); + } + + return disassembly; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh new file mode 100644 index 000000000..29aab1f70 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ +#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing HSAIL GPU static instructions. 
+ */ + +#include "gpu-compute/gpu_static_inst.hh" + +class BrigObject; +class HsailCode; + +namespace HsailISA +{ + class HsailGPUStaticInst : public GPUStaticInst + { + public: + HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode); + void generateDisassembly(); + const std::string &disassemble(); + uint32_t instSize() { return 4; } + + protected: + HsailCode *hsailCode; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc new file mode 100644 index 000000000..4e70bf46a --- /dev/null +++ b/src/arch/hsail/insts/main.cc @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/decl.hh" +#include "debug/GPUExec.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/simple_pool_manager.hh" + +namespace HsailISA +{ + template<> const char *B1::label = "b1"; + template<> const char *B8::label = "b8"; + template<> const char *B16::label = "b16"; + template<> const char *B32::label = "b32"; + template<> const char *B64::label = "b64"; + + template<> const char *S8::label = "s8"; + template<> const char *S16::label = "s16"; + template<> const char *S32::label = "s32"; + template<> const char *S64::label = "s64"; + + template<> const char *U8::label = "u8"; + template<> const char *U16::label = "u16"; + template<> const char *U32::label = "u32"; + template<> const char *U64::label = "u64"; + + template<> const char *F32::label = "f32"; + template<> const char *F64::label = "f64"; + + const char* + cmpOpToString(Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + return "eq"; + case BRIG_COMPARE_NE: + return "ne"; + case BRIG_COMPARE_LT: + return "lt"; + case BRIG_COMPARE_LE: + return "le"; + case BRIG_COMPARE_GT: + return "gt"; + case BRIG_COMPARE_GE: + return "ge"; + case BRIG_COMPARE_EQU: + return "equ"; + case BRIG_COMPARE_NEU: + return "neu"; + case BRIG_COMPARE_LTU: + return "ltu"; + case BRIG_COMPARE_LEU: + return "leu"; + case BRIG_COMPARE_GTU: + return "gtu"; + case 
BRIG_COMPARE_GEU: + return "geu"; + case BRIG_COMPARE_NUM: + return "num"; + case BRIG_COMPARE_NAN: + return "nan"; + case BRIG_COMPARE_SEQ: + return "seq"; + case BRIG_COMPARE_SNE: + return "sne"; + case BRIG_COMPARE_SLT: + return "slt"; + case BRIG_COMPARE_SLE: + return "sle"; + case BRIG_COMPARE_SGT: + return "sgt"; + case BRIG_COMPARE_SGE: + return "sge"; + case BRIG_COMPARE_SGEU: + return "sgeu"; + case BRIG_COMPARE_SEQU: + return "sequ"; + case BRIG_COMPARE_SNEU: + return "sneu"; + case BRIG_COMPARE_SLTU: + return "sltu"; + case BRIG_COMPARE_SLEU: + return "sleu"; + case BRIG_COMPARE_SNUM: + return "snum"; + case BRIG_COMPARE_SNAN: + return "snan"; + case BRIG_COMPARE_SGTU: + return "sgtu"; + default: + return "unknown"; + } + } + + void + Ret::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + // mask off completed work-items + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->init_mask[lane] = 0; + } + + } + + // delete extra instructions fetched for completed work-items + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) { + w->dropFetch = true; + } + + // if all work-items have completed, then wave-front is done + if (w->init_mask.none()) { + w->status = Wavefront::S_STOPPED; + + int32_t refCount = w->computeUnit->getLds(). 
+ decreaseRefCounter(w->dispatchid, w->wg_id); + + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", + w->computeUnit->cu_id, w->wg_id, refCount); + + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= + w->reservedVectorRegs; + + assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0); + + uint32_t endIndex = (w->startVgprIndex + + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->vrf[w->simdId]->manager-> + freeRegion(w->startVgprIndex, endIndex); + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + w->computeUnit->completedWfs++; + + DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); + + if (!refCount) { + // Notify Memory System of Kernel Completion + // Kernel End = isKernel + isRelease + w->status = Wavefront::S_RETURNING; + GPUDynInstPtr local_mempacket = gpuDynInst; + local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; + local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; + local_mempacket->useContinuation = false; + local_mempacket->simdId = w->simdId; + local_mempacket->wfSlotId = w->wfSlotId; + local_mempacket->wfDynId = w->wfDynId; + w->computeUnit->injectGlobalMemFence(local_mempacket, true); + } else { + w->computeUnit->shader->dispatcher->scheduleDispatch(); + } + } + } + + void + Barrier::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + assert(w->barrier_cnt == w->old_barrier_cnt); + w->barrier_cnt = w->old_barrier_cnt + 1; + w->stalledAtBarrier = true; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc new file mode 100644 index 000000000..97d4c902b --- /dev/null +++ b/src/arch/hsail/insts/mem.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/mem.hh" + +#include "arch/hsail/Brig.h" +#include "enums/OpType.hh" + +using namespace Brig; + +namespace HsailISA +{ + const char* atomicOpToString(BrigAtomicOperation brigOp); + + Enums::MemOpType + brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) + { + if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_AAND; + case BRIG_ATOMIC_OR: + return Enums::MO_AOR; + case BRIG_ATOMIC_XOR: + return Enums::MO_AXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ACAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_AEXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_AADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_AINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ADEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_AMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_AMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ASUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_ANRAND; + case BRIG_ATOMIC_OR: + return Enums::MO_ANROR; + case BRIG_ATOMIC_XOR: + return Enums::MO_ANRXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ANRCAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_ANREXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_ANRADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_ANRINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ANRDEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_ANRMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_ANRMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ANRSUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else { + fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); + } + } + + const char* + atomicOpToString(BrigAtomicOperation brigOp) + { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return "and"; + case BRIG_ATOMIC_OR: + return "or"; + case 
BRIG_ATOMIC_XOR: + return "xor"; + case BRIG_ATOMIC_CAS: + return "cas"; + case BRIG_ATOMIC_EXCH: + return "exch"; + case BRIG_ATOMIC_ADD: + return "add"; + case BRIG_ATOMIC_WRAPINC: + return "inc"; + case BRIG_ATOMIC_WRAPDEC: + return "dec"; + case BRIG_ATOMIC_MIN: + return "min"; + case BRIG_ATOMIC_MAX: + return "max"; + case BRIG_ATOMIC_SUB: + return "sub"; + default: + return "unknown"; + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh new file mode 100644 index 000000000..d3ce76dee --- /dev/null +++ b/src/arch/hsail/insts/mem.hh @@ -0,0 +1,1629 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ +#define __ARCH_HSAIL_INSTS_MEM_HH__ + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" + +namespace HsailISA +{ + class MemInst + { + public: + MemInst() : size(0), addr_operand(nullptr) { } + + MemInst(Enums::MemType m_type) + { + if (m_type == Enums::M_U64 || + m_type == Enums::M_S64 || + m_type == Enums::M_F64) { + size = 8; + } else if (m_type == Enums::M_U32 || + m_type == Enums::M_S32 || + m_type == Enums::M_F32) { + size = 4; + } else if (m_type == Enums::M_U16 || + m_type == Enums::M_S16 || + m_type == Enums::M_F16) { + size = 2; + } else { + size = 1; + } + + addr_operand = nullptr; + } + + void + init_addr(AddrOperandBase *_addr_operand) + { + addr_operand = _addr_operand; + } + + private: + int size; + AddrOperandBase *addr_operand; + + public: + int getMemOperandSize() { return size; } + AddrOperandBase *getAddressOperand() { return addr_operand; } + }; + + template<typename DestOperandType, typename AddrOperandType> + class LdaInstBase : public HsailGPUStaticInst + { + public: + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + 
dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.regIndex() : + this->addr.regIndex()); + } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + return 1; + } + }; + + template<typename DestDataType, typename AddrOperandType> + class LdaInst : + public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + public: + void generateDisassembly(); + + LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdaInstBase<typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode) + { + init_addr(&this->addr); + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType> + GPUStaticInst* + decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj); + + if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas"); + } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (regDataType.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas"); + default: + fatal("Bad ldas register operand type %d\n", regDataType.type); + } + } else { + fatal("Bad ldas register operand kind %d\n", regDataType.kind); + } + } + + template<typename MemOperandType, typename DestOperandType, + typename AddrOperandType> + class LdInstBase : public HsailGPUStaticInst + { + public: + Brig::BrigWidth8_t width; + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigMemoryScope memoryScope; + unsigned int equivClass; + bool isArgLoad() + { + return segment == Brig::BRIG_SEGMENT_KERNARG || + segment == Brig::BRIG_SEGMENT_ARG; + } + void 
+ initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = 
Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = BRIG_WIDTH_1; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_LD) { + initLd(ib, obj, _opcode); + } else { + initAtomicLd(ib, obj, _opcode); + } + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + }; + + template<typename MemDataType, typename DestDataType, + typename AddrOperandType> + class LdInst : + public LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + typename DestDataType::OperandType::DestOperand dest_vect[4]; + uint16_t num_dest_operands; + void generateDisassembly(); + + public: + LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + num_dest_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_dest_operands <= 4); + } else { + num_dest_operands = 1; + } + + if (num_dest_operands > 1) { + assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_dest_operands; 
++i) { + dest_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_dest_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_dest_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_dest_operands; ++k) { + + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + // load from shared memory + *d = gpuDynInst->wavefront()->ldsChunk-> + read<c0>(vaddr); + } else { + Request *req = new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + // if this load has acquire semantics, + // set the response continuation function + // to perform an Acquire request + gpuDynInst->execContinuation = + &GPUStaticInst::execLdAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when + // the load completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + private: + void + execLdAcq(GPUDynInstPtr gpuDynInst) override + { + // after the load has complete and if the load has acquire + // semantics, issue an acquire request. 
+ if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + + public: + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + if (num_dest_operands > 1) { + return dest_vect[operandIndex].isVectorRegister(); + } + else if (num_dest_operands == 1) { + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isVectorRegister(); + } + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isCondRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isCondRegister(); + else if (num_dest_operands == 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isScalarRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isScalarRegister(); + else if (num_dest_operands 
== 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return false; + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.opSize()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].opSize()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.opSize()); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.regIndex()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].regIndex()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.regIndex()); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return(num_dest_operands+1); + else + return(num_dest_operands); + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename MemDT, typename DestDT> + GPUStaticInst* + decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = 
obj->getOperandPtr(ib->operands,1); + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld"); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdInst<MemDT, DestDT, + SRegAddrOperand>(ib, obj, "ld"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdInst<MemDT, DestDT, + DRegAddrOperand>(ib, obj, "ld"); + default: + fatal("Bad ld register operand type %d\n", tmp.regKind); + } + } else { + fatal("Bad ld register operand kind %d\n", tmp.kind); + } + } + + template<typename MemDT> + GPUStaticInst* + decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + BrigRegOperandInfo dest = findRegDataType(op_offs, obj); + + assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + switch(dest.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + case Brig::BRIG_TYPE_B16: + case Brig::BRIG_TYPE_B32: + return decodeLd2<MemDT, B32>(ib, obj); + case Brig::BRIG_TYPE_U8: + case Brig::BRIG_TYPE_U16: + case Brig::BRIG_TYPE_U32: + return decodeLd2<MemDT, U32>(ib, obj); + case Brig::BRIG_TYPE_S8: + case Brig::BRIG_TYPE_S16: + case Brig::BRIG_TYPE_S32: + return decodeLd2<MemDT, S32>(ib, obj); + case Brig::BRIG_TYPE_F16: + case Brig::BRIG_TYPE_F32: + return decodeLd2<MemDT, U32>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + case Brig::BRIG_REGISTER_KIND_DOUBLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B64: + return decodeLd2<MemDT, B64>(ib, obj); + case Brig::BRIG_TYPE_U64: + return decodeLd2<MemDT, U64>(ib, obj); + case Brig::BRIG_TYPE_S64: + return decodeLd2<MemDT, S64>(ib, obj); + case 
Brig::BRIG_TYPE_F64: + return decodeLd2<MemDT, U64>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + default: + fatal("Bad ld register operand type %d, %d\n", dest.regKind, + ib->type); + } + } + + template<typename MemDataType, typename SrcOperandType, + typename AddrOperandType> + class StInstBase : public HsailGPUStaticInst + { + public: + typename SrcOperandType::SrcOperand src; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryScope memoryScope; + Brig::BrigMemoryOrder memoryOrder; + unsigned int equivClass; + + void + initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); + + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + 
const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_ST) { + initSt(ib, obj, _opcode); + } else { + initAtomicSt(ib, obj, _opcode); + } + } + + int numDstRegOperands() { return 0; } + int numSrcRegOperands() + { + return src.isVectorRegister() + this->addr.isVectorRegister(); + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isVectorRegister() : + this->addr.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? 
src.isCondRegister() : + this->addr.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isScalarRegister() : + this->addr.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.opSize() : this->addr.opSize(); + } + int getRegisterIndex(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.regIndex() : this->addr.regIndex(); + } + }; + + + template<typename MemDataType, typename SrcDataType, + typename AddrOperandType> + class StInst : + public StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>, + public MemInst + { + public: + typename SrcDataType::OperandType::SrcOperand src_vect[4]; + uint16_t num_src_operands; + void generateDisassembly(); + + StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode, int srcIdx) + : StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(SrcDataType::memType) + { + init_addr(&this->addr); + + BrigRegOperandInfo rinfo; + unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); + const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + const Brig::BrigOperandConstantBytes *op = + (Brig::BrigOperandConstantBytes*)baseOp; + + rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, + Brig::BRIG_TYPE_NONE); + } else { + rinfo = findRegDataType(op_offs, obj); + } + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const 
Brig::BrigOperandOperandList*)baseOp; + + num_src_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_src_operands <= 4); + } else { + num_src_operands = 1; + } + + if (num_src_operands > 1) { + assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_src_operands; ++i) { + src_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before performing a store, check if this store has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE) { + + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->execContinuation = &GPUStaticInst::execSt; + gpuDynInst->useContinuation = true; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, perform stores immediately + execSt(gpuDynInst); + } + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execSt may be called through a continuation + // if the store had release semantics. 
see comment for + // execSt in gpu_static_inst.hh + void + execSt(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_src_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_src_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_src_operands; ++k) { + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + //store to shared memory + gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr, + *d); + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataStatic<c0>(d); + + // translation is performed in sendRequest() + // the request will be finished when the store completes + gpuDynInst->useContinuation = false; + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + public: + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isVectorRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isVectorRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isVectorRegister(); + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isCondRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isCondRegister(); + else if 
(num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isScalarRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isScalarRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.opSize(); + if (num_src_operands > 1) + return src_vect[operandIndex].opSize(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.opSize(); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.regIndex(); + if (num_src_operands > 1) + return src_vect[operandIndex].regIndex(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.regIndex(); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return num_src_operands + 1; + else + return num_src_operands; + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType, typename SrcDataType> + GPUStaticInst* + decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + int srcIdx = 0; + int destIdx = 1; + if 
(ib->opcode == Brig::BRIG_OPCODE_ATOMIC || + ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { + srcIdx = 1; + destIdx = 0; + } + unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new StInst<DataType, SrcDataType, + NoRegAddrOperand>(ib, obj, "st", srcIdx); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new StInst<DataType, SrcDataType, + SRegAddrOperand>(ib, obj, "st", srcIdx); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new StInst<DataType, SrcDataType, + DRegAddrOperand>(ib, obj, "st", srcIdx); + default: + fatal("Bad st register operand type %d\n", tmp.type); + } + } else { + fatal("Bad st register operand kind %d\n", tmp.kind); + } + } + + Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, + Brig::BrigAtomicOperation brigOp); + + template<typename OperandType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInstBase : public HsailGPUStaticInst + { + public: + typename OperandType::DestOperand dest; + typename OperandType::SrcOperand src[NumSrcOperands]; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigAtomicOperation atomicOperation; + Brig::BrigMemoryScope memoryScope; + Brig::BrigOpcode opcode; + Enums::MemOpType opType; + + AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + atomicOperation = (BrigAtomicOperation)at->atomicOperation; + opcode = (BrigOpcode)ib->opcode; + opType = brigAtomicToMemOpType(opcode, 
atomicOperation); + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_ATOMIC; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_ATOMIC; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_ATOMIC; + break; + + default: + panic("Atomic: segment %d not supported\n", segment); + } + + if (HasDst) { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 2); + src[i].init(op_offs, obj); + } + } else { + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + } + + int numSrcRegOperands() + { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + if (addr.isVectorRegister()) + operands++; + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (addr.isVectorRegister()) + return(NumSrcOperands + 2); + return(NumSrcOperands + 1); + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isCondRegister()); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + 
assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isScalarRegister()); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return false; + } + bool isDstOperand(int operandIndex) + { + if (operandIndex <= NumSrcOperands) + return false; + else + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].opSize()); + else if (operandIndex == NumSrcOperands) + return(addr.opSize()); + else + return(dest.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].regIndex()); + else if (operandIndex == NumSrcOperands) + return(addr.regIndex()); + else + return(dest.regIndex()); + return -1; + } + }; + + template<typename MemDataType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInst : + public AtomicInstBase<typename MemDataType::OperandType, + AddrOperandType, NumSrcOperands, HasDst>, + public MemInst + { + public: + void generateDisassembly(); + + AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType, + NumSrcOperands, HasDst> + (ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before doing the RMW, check if this atomic has + // release semantics, and if so issue a release first + if 
(!isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->memoryOrder ==
                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    // after the release fence completes, the continuation
                    // performs the actual RMW
                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId, -1);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, execute the RMW immediately
            execAtomic(gpuDynInst);

        }

        void execute(GPUDynInstPtr gpuDynInst);

        // group-segment atomics operate on the wavefront's LDS chunk
        bool
        isLocalMem() const override
        {
            return this->segment == Brig::BRIG_SEGMENT_GROUP;
        }

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        //
        // Performs the read-modify-write for every active lane. LDS atomics
        // are done inline here (old value captured into d_data before the
        // write-back); global atomics build a SwapReq packet carrying an
        // atomic-op functor and are sent to the memory system.
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

            // d: destination (old value), e: first source operand,
            // f: second source operand (CAS swap value)
            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < VSZ; ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        // capture the old value before modifying the location
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        // MO_A* return the old value; MO_ANR* are the
                        // no-return variants -- both mutate LDS identically
                        switch (this->opType) {
                          case Enums::MO_AADD:
                          case Enums::MO_ANRADD:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                            break;
                          case Enums::MO_ASUB:
                          case Enums::MO_ANRSUB:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                            break;
                          case Enums::MO_AMAX:
                          case Enums::MO_ANRMAX:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                            break;
                          case Enums::MO_AMIN:
                          case Enums::MO_ANRMIN:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                            break;
                          case Enums::MO_AAND:
                          case Enums::MO_ANRAND:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                            break;
                          case Enums::MO_AOR:
                          case Enums::MO_ANROR:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                            break;
                          case Enums::MO_AXOR:
                          case Enums::MO_ANRXOR:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                            break;
                          case Enums::MO_AINC:
                          case Enums::MO_ANRINC:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                            break;
                          case Enums::MO_ADEC:
                          case Enums::MO_ANRDEC:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                            break;
                          case Enums::MO_AEXCH:
                          case Enums::MO_ANREXCH:
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                            break;
                          case Enums::MO_ACAS:
                          case Enums::MO_ANRCAS:
                            // compare against *e, swap in *f on match
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                                 (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                            break;
                          default:
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                            break;
                        }
                    } else {
                        // global atomic: the memory system applies the op
                        // via the functor attached to the request
                        Request *req =
                            new Request(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId, i,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                        f, this->opType));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->memoryOrder ==
                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the RMW completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
+ // see comment for execContinuation in gpu_dyn_inst.hh + void + execAtomicAcq(GPUDynInstPtr gpuDynInst) override + { + // after performing the RMW, check to see if this instruction + // has acquire semantics, and if so, issue an acquire + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + + // the request will be finished when + // the acquire completes + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + }; + + template<typename DataType, typename AddrOperandType, int NumSrcOperands> + GPUStaticInst* + constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { + return decodeLd<DataType>(ib, obj); + } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + return decodeSt<S8,S8>(ib, obj); + case Brig::BRIG_TYPE_B16: + return decodeSt<S8,S16>(ib, obj); + case Brig::BRIG_TYPE_B32: + return decodeSt<S8,S32>(ib, obj); + case Brig::BRIG_TYPE_B64: + return decodeSt<S8,S64>(ib, obj); + default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); + } + } else { + if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, false>(ib, obj, "atomicnoret"); + else + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, true>(ib, obj, "atomic"); + } + } + + template<typename DataType, int NumSrcOperands> + GPUStaticInst* + decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned 
addrIndex = (Brig::BrigOpcode)ib->opcode == + Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1; + + unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return constructAtomic<DataType, NoRegAddrOperand, + NumSrcOperands>(ib, obj); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return constructAtomic<DataType, SRegAddrOperand, + NumSrcOperands>(ib, obj); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return constructAtomic<DataType, DRegAddrOperand, + NumSrcOperands>(ib, obj); + default: + fatal("Bad atomic register operand type %d\n", tmp.type); + } + } else { + fatal("Bad atomic register operand kind %d\n", tmp.kind); + } + } + + + template<typename DataType> + GPUStaticInst* + decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } + + template<typename DataType> + GPUStaticInst* + decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_MEM_HH__ diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh new file mode 100644 index 000000000..94f0cd6aa --- /dev/null +++ b/src/arch/hsail/insts/mem_impl.hh @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/generic_types.hh" +#include "gpu-compute/hsail_code.hh" + +// defined in code.cc, but not worth sucking in all of code.h for this +// at this point +extern const char *segmentNames[]; + +namespace HsailISA +{ + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly() + { + this->disassembly = csprintf("%s_%s %s,%s", this->opcode, + DestDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + } + + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename DestDataType::CType CType M5_VAR_USED; + const VectorMask &mask = w->get_pred(); + uint64_t addr_vec[VSZ]; + this->addr.calcVector(w, addr_vec); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, addr_vec[lane]); + } + } + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly() + { + switch (num_dest_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->dest_vect[2].disassemble(), + this->dest_vect[3].disassemble(), + this->addr.disassemble()); + 
break; + default: + fatal("Bad ld register dest operand, num vector operands: %d \n", + num_dest_operands); + break; + } + } + + static Addr + calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i) + { + // what is the size of the object we are accessing?? + // NOTE: the compiler doesn't generate enough information + // to do this yet..have to just line up all the private + // work-item spaces back to back for now + /* + StorageElement* se = + i->parent->findSymbol(Brig::BrigPrivateSpace, addr); + assert(se); + + return w->wfSlotId * w->privSizePerItem * VSZ + + se->offset * VSZ + + lane * se->size; + */ + + // addressing strategy: interleave the private spaces of + // work-items in a wave-front on 8 byte granularity. + // this won't be perfect coalescing like the spill space + // strategy, but it's better than nothing. The spill space + // strategy won't work with private because the same address + // may be accessed by different sized loads/stores. + + // Note: I'm assuming that the largest load/store to private + // is 8 bytes. If it is larger, the stride will have to increase + + Addr addr_div8 = addr / 8; + Addr addr_mod8 = addr % 8; + + Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + + assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + + return ret; + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename MemDataType::CType MemCType; + const VectorMask &mask = w->get_pred(); + + // Kernarg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. 
+ if (this->segment == Brig::BRIG_SEGMENT_KERNARG) { + MemCType val; + + // I assume no vector ld for kernargs + assert(num_dest_operands == 1); + + // assuming for the moment that we'll never do register + // offsets into kernarg space... just to make life simpler + uint64_t address = this->addr.calcUniform(); + + val = *(MemCType*)&w->kernelArgs[address]; + + DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, val); + } + } + + return; + } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + MemCType val = w->readCallArgMem<MemCType>(lane, address); + + DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address, + (unsigned long long)val); + + this->dest.set(w, lane, val); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + m->m_op = Enums::MO_LD; + m->m_type = MemDataType::memType; + m->v_type = DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = this->equivClass; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (num_dest_operands == 1) { + m->dst_reg = this->dest.regIndex(); + m->n_reg = 1; + } else { + m->n_reg = num_dest_operands; + for (int i = 0; i < num_dest_operands; ++i) { + m->dst_reg_vec[i] = this->dest_vect[i].regIndex(); + } + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates 
global access for private + // addresses (starting from 0). We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (m->addr[lane] < w->privSizePerItem) { + if (mask[lane]) { + // what is the size of the object we are accessing? + // find base for for this wavefront + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(MemCType) - 1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_dest_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + // note: this calculation will NOT WORK if the compiler + // ever generates loads/stores to the same address with + // different widths (e.g., a ld_u32 addr and a ld_u16 addr) + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(MemCType) + w->spillBase; + + w->last_addr[lane] = m->addr[lane]; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_READONLY: + m->s_type = SEG_READONLY; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); + m->addr[lane] += w->roBase; + } + } + + 
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + + m->addr[lane] = m->addr[lane] + + lane * sizeof(MemCType) + w->privBase; + } + } + } + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + default: + fatal("Load to unsupported segment %d %llxe\n", this->segment, + m->addr[0]); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename OperationType::CType CType; + + const VectorMask &mask = w->get_pred(); + + // arg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. 
+ if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType data = this->src.template get<CType>(w, lane); + DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); + w->writeCallArgMem<CType>(lane, address, data); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + m->exec_mask = w->execMask(); + + this->addr.calcVector(w, m->addr); + + if (num_src_operands == 1) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[lane] = + this->src.template get<CType>(w, lane); + } + } + } else { + for (int k= 0; k < num_src_operands; ++k) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[k * VSZ + lane] = + this->src_vect[k].template get<CType>(w, lane); + } + } + } + } + + m->m_op = Enums::MO_ST; + m->m_type = OperationType::memType; + m->v_type = OperationType::vgprType; + + m->statusBitVector = 0; + m->equiv = this->equivClass; + + if (num_src_operands == 1) { + m->n_reg = 1; + } else { + m->n_reg = num_src_operands; + } + + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). 
We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + if (m->addr[lane] < w->privSizePerItem) { + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(CType)-1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_src_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(CType) + w->spillBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + m->addr[lane] = m->addr[lane] + lane * + sizeof(CType)+w->privBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + default: + fatal("Store to unsupported segment %d\n", this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, 
+ AddrRegOperandType>::generateDisassembly() + { + switch (num_src_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->src_vect[2].disassemble(), + this->src_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: fatal("Bad ld register src operand, num vector operands: " + "%d \n", num_src_operands); + break; + } + } + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + HasDst>::execute(GPUDynInstPtr gpuDynInst) + { + typedef typename DataType::CType CType; + + Wavefront *w = gpuDynInst->wavefront(); + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + for (int lane = 0; lane < VSZ; ++lane) { + ((CType *)m->a_data)[lane] = + this->src[0].template get<CType>(w, lane); + } + + // load second source operand for CAS + if (NumSrcOperands > 1) { + for (int lane = 0; lane < VSZ; ++lane) { + ((CType*)m->x_data)[lane] = + this->src[1].template get<CType>(w, lane); + } + } + + assert(NumSrcOperands <= 2); + + m->m_op = this->opType; + m->m_type = DataType::memType; + m->v_type = DataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = 
getGenericMemoryScope(this->memoryScope); + + if (HasDst) { + m->dst_reg = this->dest.regIndex(); + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->latency.set(w->computeUnit->shader->ticks(64)); + m->pipeId = GLBMEM_PIPE; + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + default: + fatal("Atomic op to unsupported segment %d\n", + this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp); + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + HasDst>::generateDisassembly() + { + if (HasDst) { + this->disassembly = + csprintf("%s_%s_%s_%s %s,%s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->dest.disassemble(), + this->addr.disassemble()); + } else { + this->disassembly = + csprintf("%s_%s_%s_%s %s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->addr.disassemble()); + } + + for (int i = 0; i < NumSrcOperands; ++i) { + this->disassembly += ","; + this->disassembly += this->src[i].disassemble(); + } + } +} // namespace HsailISA diff --git 
a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc new file mode 100644 index 000000000..9506a80ab --- /dev/null +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Marc Orr + */ + +#include <csignal> + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/mem.hh" + +namespace HsailISA +{ + // Pseudo (or magic) instructions are overloaded on the hsail call + // instruction, because of its flexible parameter signature. + + // To add a new magic instruction: + // 1. Add an entry to the enum. + // 2. Implement it in the switch statement below (Call::exec). + // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, + // so its easy to call from an OpenCL kernel. + + // This enum should be identical to the enum in + // hsa/hsail-gpu-compute/util/magicinst.h + enum + { + MAGIC_PRINT_WF_32 = 0, + MAGIC_PRINT_WF_64, + MAGIC_PRINT_LANE, + MAGIC_PRINT_LANE_64, + MAGIC_PRINT_WF_FLOAT, + MAGIC_SIM_BREAK, + MAGIC_PREF_SUM, + MAGIC_REDUCTION, + MAGIC_MASKLANE_LOWER, + MAGIC_MASKLANE_UPPER, + MAGIC_JOIN_WF_BAR, + MAGIC_WAIT_WF_BAR, + MAGIC_PANIC, + MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, + MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, + MAGIC_LOAD_GLOBAL_U32_REG, + MAGIC_XACT_CAS_LD, + MAGIC_MOST_SIG_THD, + MAGIC_MOST_SIG_BROADCAST, + MAGIC_PRINT_WFID_32, + MAGIC_PRINT_WFID_64 + }; + + void + Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + const VectorMask &mask = w->get_pred(); + + int op = 0; + bool got_op = false; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val0 = src1.get<int>(w, lane, 0); + if (got_op) { + if (src_val0 != op) { + fatal("Multiple magic instructions per PC not " + "supported\n"); + } + } else { + op = src_val0; + got_op = true; + } + } + } + + switch(op) { + case MAGIC_PRINT_WF_32: + MagicPrintWF32(w); + break; + case MAGIC_PRINT_WF_64: + MagicPrintWF64(w); + break; + case MAGIC_PRINT_LANE: + MagicPrintLane(w); + break; + case MAGIC_PRINT_LANE_64: + MagicPrintLane64(w); + break; + case MAGIC_PRINT_WF_FLOAT: + MagicPrintWFFloat(w); + break; + case MAGIC_SIM_BREAK: + MagicSimBreak(w); + break; + case MAGIC_PREF_SUM: + MagicPrefixSum(w); + break; + 
case MAGIC_REDUCTION: + MagicReduction(w); + break; + case MAGIC_MASKLANE_LOWER: + MagicMaskLower(w); + break; + case MAGIC_MASKLANE_UPPER: + MagicMaskUpper(w); + break; + case MAGIC_JOIN_WF_BAR: + MagicJoinWFBar(w); + break; + case MAGIC_WAIT_WF_BAR: + MagicWaitWFBar(w); + break; + case MAGIC_PANIC: + MagicPanic(w); + break; + + // atomic instructions + case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: + MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: + MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); + break; + + case MAGIC_LOAD_GLOBAL_U32_REG: + MagicLoadGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_XACT_CAS_LD: + MagicXactCasLd(w); + break; + + case MAGIC_MOST_SIG_THD: + MagicMostSigThread(w); + break; + + case MAGIC_MOST_SIG_BROADCAST: + MagicMostSigBroadcast(w); + break; + + case MAGIC_PRINT_WFID_32: + MagicPrintWF32ID(w); + break; + + case MAGIC_PRINT_WFID_64: + MagicPrintWFID64(w); + break; + + default: fatal("unrecognized magic instruction: %d\n", op); + } + } + + void + Call::MagicPrintLane(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintLane64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), 
w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintWF32(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWF32ID(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWF64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = 
csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWFID64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWFFloat(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + float src_val1 = src1.get<float>(w, lane, 1); + res_str += csprintf("%08f", src_val1); + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { 
+ res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + // raises a signal that GDB will catch + // when done with the break, type "signal 0" in gdb to continue + void + Call::MagicSimBreak(Wavefront *w) + { + std::string res_str; + // print out state for this wavefront and then break + res_str = csprintf("Breakpoint encountered for wavefront %i\n", + w->wfSlotId); + + res_str += csprintf(" Kern ID: %i\n", w->kern_id); + res_str += csprintf(" Phase ID: %i\n", w->simdId); + res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); + res_str += csprintf(" Exec mask: "); + + for (int i = VSZ - 1; i >= 0; --i) { + if (w->execMask(i)) + res_str += "1"; + else + res_str += "0"; + + if ((i & 7) == 7) + res_str += " "; + } + + res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); + + res_str += "\nHelpful debugging hints:\n"; + res_str += " Check out w->s_reg / w->d_reg for register state\n"; + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + fflush(stdout); + + raise(SIGTRAP); + } + + void + Call::MagicPrefixSum(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + dest.set<int>(w, lane, res); + res += src_val1; + } + } + } + + void + Call::MagicReduction(Wavefront *w) + { + // reduction magic instruction + // The reduction instruction takes up to 64 inputs (one from + // each thread in a WF) and sums them. It returns the sum to + // each thread in the WF. 
+ const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + res += src_val1; + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskLower(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane < (VSZ/2)) { + res = res | ((uint32_t)(1) << lane); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskUpper(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane >= (VSZ/2)) { + res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicJoinWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]++; + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + } + + if (max_cnt > w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + } + + void + Call::MagicWaitWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]--; + } + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + + if (max_cnt < w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) + w->dropFetch = true; + } 
+ + void + Call::MagicPanic(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + panic("OpenCL Code failed assertion #%d. Triggered by lane %s", + src_val1, lane); + } + } + } + + void + Call::calcAddr(Wavefront *w, GPUDynInstPtr m) + { + // the address is in src1 | src2 + for (int lane = 0; lane < VSZ; ++lane) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); + + m->addr[lane] = addr; + } + + } + + void + Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); + } + + m->m_op = 
brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + // calculate the address + calcAddr(w, m); + + m->m_op = Enums::MO_LD; + m->m_type = U32::memType; //MemDataType::memType; + m->v_type = U32::vgprType; //DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + // FIXME + //m->dst_reg = this->dest.regIndex(); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicXactCasLd(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int src_val1 = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + src_val1 = src1.get<int>(w, lane, 1); + 
break; + } + } + + if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { + w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); + } + + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue + .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); + } + + void + Call::MagicMostSigThread(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + unsigned mst = true; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + dest.set<int>(w, lane, mst); + mst = false; + } + } + } + + void + Call::MagicMostSigBroadcast(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + bool got_res = false; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + if (!got_res) { + res = src1.get<int>(w, lane, 1); + got_res = true; + } + dest.set<int>(w, lane, res); + } + } + } + +} // namespace HsailISA |