summaryrefslogtreecommitdiff
path: root/src/arch/hsail/insts
diff options
context:
space:
mode:
Diffstat (limited to 'src/arch/hsail/insts')
-rw-r--r--src/arch/hsail/insts/branch.cc86
-rw-r--r--src/arch/hsail/insts/branch.hh442
-rw-r--r--src/arch/hsail/insts/decl.hh1106
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.cc64
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.hh65
-rw-r--r--src/arch/hsail/insts/main.cc208
-rw-r--r--src/arch/hsail/insts/mem.cc139
-rw-r--r--src/arch/hsail/insts/mem.hh1629
-rw-r--r--src/arch/hsail/insts/mem_impl.hh660
-rw-r--r--src/arch/hsail/insts/pseudo_inst.cc787
10 files changed, 5186 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc
new file mode 100644
index 000000000..d65279cc8
--- /dev/null
+++ b/src/arch/hsail/insts/branch.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/branch.hh"
+
+#include "gpu-compute/hsail_code.hh"
+
+namespace HsailISA
+{
+ GPUStaticInst*
+ decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // register operand.
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new BrnIndirectInst(ib, obj);
+ } else {
+ return new BrnDirectInst(ib, obj);
+ }
+ }
+
+ GPUStaticInst*
+ decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // second register operand (after the condition).
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new CbrIndirectInst(ib, obj);
+ } else {
+ return new CbrDirectInst(ib, obj);
+ }
+ }
+
    GPUStaticInst*
    decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // Choose between the direct (label target) and indirect
        // (register target) forms of br by inspecting an operand.
        //
        // NOTE(review): this comment block was copied from decodeCbr --
        // br is unconditional, so there is no condition operand. The
        // probe also reads operand 1 while BrInstBase reads its target
        // from operand 0; presumably operand 0 holds the width
        // immediate here -- confirm against the BRIG operand layout
        // for br.
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        const Brig::BrigOperand *reg = obj->getOperand(op_offs);

        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            return new BrIndirectInst(ib, obj);
        } else {
            return new BrDirectInst(ib, obj);
        }
    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
new file mode 100644
index 000000000..54ad9a042
--- /dev/null
+++ b/src/arch/hsail/insts/branch.hh
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
+#define __ARCH_HSAIL_INSTS_BRANCH_HH__
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+
+ // The main difference between a direct branch and an indirect branch
+ // is whether the target is a register or a label, so we can share a
+ // lot of code if we template the base implementation on that type.
+ template<typename TargetType>
+ class BrnInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ TargetType target;
+
+ BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "brn")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr*)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+
+ bool isSrcOperand(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return true;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+
+ int getNumOperands() {
+ return 1;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename TargetType>
+ void
+ BrnInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
    // Unconditional branch: every active lane takes it, so the
    // execution mask is left untouched.
    template<typename TargetType>
    void
    BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            // Branching straight to the reconvergence point: resume the
            // saved stack entry instead of jumping.
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+ class BrnDirectInst : public BrnInstBase<LabelOperand>
+ {
+ public:
+ BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrnIndirectInst : public BrnInstBase<SRegOperand>
+ {
+ public:
+ BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class CbrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ CRegOperand cond;
+ TargetType target;
+
+ CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "cbr")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr *)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ cond.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ // Assumption: Target is operand 0, Condition Register is operand 1
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isVectorRegister();
+ else
+ return false;
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isCondRegister();
+ else
+ return true;
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return target.isScalarRegister();
+ else
+ return false;
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == 0)
+ return true;
+ return false;
+ }
+ // both Condition Register and Target are source operands
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.opSize();
+ else
+ return 1;
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.regIndex();
+ else
+ return -1;
+ }
+
+ // Operands = Target, Condition Register
+ int getNumOperands() {
+ return 2;
+ }
+ };
+
+ template<typename TargetType>
+ void
+ CbrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
+ cond.disassemble(), target.disassemble());
+ }
+
    // Conditional branch with divergence support: splits the current
    // execution mask into taken/not-taken subsets and pushes them on
    // the wavefront's reconvergence stack so lanes re-join at the
    // immediate post-dominator.
    template<typename TargetType>
    void
    CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        const uint32_t curr_pc = w->pc();
        const uint32_t curr_rpc = w->rpc();
        const VectorMask curr_mask = w->execMask();

        /**
         * TODO: can we move this pop outside the instruction, and
         * into the wavefront?
         */
        w->popFromReconvergenceStack();

        // immediate post-dominator instruction, where the diverging
        // lanes will reconverge
        const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
        if (curr_pc != rpc) {
            w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
        }

        // taken branch: lanes whose condition is true AND that were
        // active to begin with
        const uint32_t true_pc = getTargetPc();
        VectorMask true_mask;
        for (unsigned int lane = 0; lane < VSZ; ++lane) {
            true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
        }

        // not taken branch: falls through to the next instruction.
        // Only pushed if some active lane did not take the branch
        // (true_mask smaller than curr_mask) and the fall-through is
        // not already the reconvergence point.
        const uint32_t false_pc = curr_pc + 1;
        assert(true_pc != false_pc);
        if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
            VectorMask false_mask = curr_mask & ~true_mask;
            w->pushToReconvergenceStack(false_pc, rpc, false_mask);
        }

        // Push the taken side last so it executes first.
        if (true_pc != rpc && true_mask.count()) {
            w->pushToReconvergenceStack(true_pc, rpc, true_mask);
        }
        assert(w->pc() != curr_pc);
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+
+ class CbrDirectInst : public CbrInstBase<LabelOperand>
+ {
+ public:
+ CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ // the source operand of a conditional branch is a Condition
+ // Register which is not stored in the VRF
+ // so we do not count it as a source-register operand
+ // even though, formally, it is one.
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class CbrIndirectInst : public CbrInstBase<SRegOperand>
+ {
+ public:
+ CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ // one source operand of the conditional indirect branch is a Condition
+ // register which is not stored in the VRF so we do not count it
+ // as a source-register operand even though, formally, it is one.
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class BrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ ImmOperand<uint32_t> width;
+ TargetType target;
+
+ BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "br")
+ {
+ o_type = Enums::OT_BRANCH;
+ width.init(((Brig::BrigInstBr *)ib)->width, obj);
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return true;
+ }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+ int getNumOperands() { return 1; }
+ };
+
+ template<typename TargetType>
+ void
+ BrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width.bits != 1) {
+ widthClause = csprintf("_width(%d)", width.bits);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
    // Unconditional branch: every active lane takes it, so the
    // execution mask is left untouched.
    template<typename TargetType>
    void
    BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            // Branching straight to the reconvergence point: resume the
            // saved stack entry instead of jumping.
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+ class BrDirectInst : public BrInstBase<LabelOperand>
+ {
+ public:
+ BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrIndirectInst : public BrInstBase<SRegOperand>
+ {
+ public:
+ BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
new file mode 100644
index 000000000..e2da501b9
--- /dev/null
+++ b/src/arch/hsail/insts/decl.hh
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_DECL_HH__
+#define __ARCH_HSAIL_INSTS_DECL_HH__
+
+#include <cmath>
+
+#include "arch/hsail/generic_types.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "debug/HSAIL.hh"
+#include "enums/OpType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+namespace HsailISA
+{
+ template<typename _DestOperand, typename _SrcOperand>
+ class HsailOperandType
+ {
+ public:
+ typedef _DestOperand DestOperand;
+ typedef _SrcOperand SrcOperand;
+ };
+
+ typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType;
+ typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType;
+ typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType;
+
+    // The IsBits parameter serves only to disambiguate the B* types from
+    // the U* types, which otherwise would be identical (and
+    // indistinguishable).
+ template<typename _OperandType, typename _CType, Enums::MemType _memType,
+ vgpr_type _vgprType, int IsBits=0>
+ class HsailDataType
+ {
+ public:
+ typedef _OperandType OperandType;
+ typedef _CType CType;
+ static const Enums::MemType memType = _memType;
+ static const vgpr_type vgprType = _vgprType;
+ static const char *label;
+ };
+
+ typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1;
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8;
+
+ typedef HsailDataType<SRegOperandType, uint16_t,
+ Enums::M_U16, VT_32, 1> B16;
+
+ typedef HsailDataType<SRegOperandType, uint32_t,
+ Enums::M_U32, VT_32, 1> B32;
+
+ typedef HsailDataType<DRegOperandType, uint64_t,
+ Enums::M_U64, VT_64, 1> B64;
+
+ typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8;
+ typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16;
+ typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32;
+ typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64;
+
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8;
+ typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16;
+ typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32;
+ typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64;
+
+ typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32;
+ typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64;
+
+ template<typename DestOperandType, typename SrcOperandType,
+ int NumSrcOperands>
+ class CommonInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename SrcOperandType::SrcOperand src[NumSrcOperands];
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s%s %s", opcode, opcode_suffix(),
+ dest.disassemble());
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ disassembly += ",";
+ disassembly += src[i].disassemble();
+ }
+ }
+
+ virtual std::string opcode_suffix() = 0;
+
+ public:
+ CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+
+ dest.init(op_offs, obj);
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ op_offs = obj->getOperandPtr(ib->operands, i + 1);
+ src[i].init(op_offs, obj);
+ }
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return true;
+ return false;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= NumSrcOperands)
+ return true;
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].regIndex();
+ else
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() {
+ int operands = 0;
+ for (int i = 0; i < NumSrcOperands; i++) {
+ if (src[i].isVectorRegister() == true) {
+ operands++;
+ }
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return NumSrcOperands + 1; }
+ };
+
+ template<typename DataType, int NumSrcOperands>
+ class ArithInst : public CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>
+ {
+ public:
+ std::string opcode_suffix() { return csprintf("_%s", DataType::label); }
+
+ ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType, typename Src2OperandType>
+ class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+ typename Src2OperandType::SrcOperand src2;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble(),
+ src2.disassemble());
+ }
+
+ public:
+ ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 3);
+ src2.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else if (operandIndex == 2)
+ return src2.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else if (operandIndex == 2)
+ return src2.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else if (operandIndex == 2)
+ return src2.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 3)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 3)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else if (operandIndex == 2)
+ return src2.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else if (operandIndex == 2)
+ return src2.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src2.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 4; }
+ };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType, typename Src2DataType>
+ class ThreeNonUniformSourceInst :
+ public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+ typedef typename Src2DataType::CType Src2CType;
+
+ ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class CmovInst : public ThreeNonUniformSourceInst<DataType, B1,
+ DataType, DataType>
+ {
+ public:
+ CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, B1, DataType,
+ DataType>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType,
+ DataType, U32,
+ U32>
+ {
+ public:
+ ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, DataType, U32,
+ U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType>
+ class TwoNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble());
+ }
+
+
+ public:
+ TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 2)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 2)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ // count how many of the two sources live in vector registers
+ // (immediates and non-vector operands are not counted)
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ // dest contributes one register destination only when it is a
+ // vector register (bool converts to 0/1)
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 3; }
+ };
+
+ // Typed wrapper over TwoNonUniformSourceInstBase: binds the three
+ // DataType parameters to their OperandTypes and exposes the CTypes
+ // used by execute() implementations. Adds no state of its own.
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType>
+ class TwoNonUniformSourceInst :
+ public TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+
+ TwoNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ // helper function for ClassInst
+ // Returns true if src0 belongs to any of the HSAIL floating-point
+ // classes selected by the src1 bit mask (bit 0 sNaN, 1 qNaN,
+ // 2 -inf, 3 -normal, 4 -subnormal, 5 -0, 6 +0, 7 +subnormal,
+ // 8 +normal, 9 +inf).
+ template<typename T>
+ bool
+ fpclassify(T src0, uint32_t src1)
+ {
+ int fpclass = std::fpclassify(src0);
+
+ // NOTE: bits 0 (signaling NaN) and 1 (quiet NaN) are treated
+ // alike here; the NaN kind is not distinguished
+ if ((src1 & 0x3) && (fpclass == FP_NAN)) {
+ return true;
+ }
+
+ // use signbit() to select the sign branch: the previous test
+ // (src0 <= -0.0) also matched +0.0, since +0.0 == -0.0 under
+ // IEEE comparison, so positive zero was classified as negative
+ // zero (bit 5) and bit 6 never matched
+ if (std::signbit(src0)) {
+ if ((src1 & 0x4) && fpclass == FP_INFINITE)
+ return true;
+ if ((src1 & 0x8) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x10) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x20) && fpclass == FP_ZERO)
+ return true;
+ } else {
+ if ((src1 & 0x40) && fpclass == FP_ZERO)
+ return true;
+ if ((src1 & 0x80) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x100) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x200) && fpclass == FP_INFINITE)
+ return true;
+ }
+ return false;
+ }
+
+ // HSAIL "class" instruction: dest is a b1 predicate, src0 is the
+ // value under test and src1 is the u32 class bit mask consumed by
+ // the fpclassify() helper above.
+ template<typename DataType>
+ class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32>
+ {
+ public:
+ ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // Shift instructions: dest and src0 share DataType; the shift
+ // amount (src1) is always u32.
+ template<typename DataType>
+ class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+ {
+ public:
+ ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // helper function for CmpInst
+ // Evaluates an HSAIL compare operation on src0/src1. The ordered
+ // and unordered variants (e.g. EQ vs EQU) map to the same C++
+ // comparison here; NUM/NAN rely on NaN != NaN to detect NaNs.
+ template<typename T>
+ bool
+ compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ case BRIG_COMPARE_EQU:
+ case BRIG_COMPARE_SEQ:
+ case BRIG_COMPARE_SEQU:
+ return (src0 == src1);
+
+ case BRIG_COMPARE_NE:
+ case BRIG_COMPARE_NEU:
+ case BRIG_COMPARE_SNE:
+ case BRIG_COMPARE_SNEU:
+ return (src0 != src1);
+
+ case BRIG_COMPARE_LT:
+ case BRIG_COMPARE_LTU:
+ case BRIG_COMPARE_SLT:
+ case BRIG_COMPARE_SLTU:
+ return (src0 < src1);
+
+ case BRIG_COMPARE_LE:
+ case BRIG_COMPARE_LEU:
+ case BRIG_COMPARE_SLE:
+ case BRIG_COMPARE_SLEU:
+ return (src0 <= src1);
+
+ case BRIG_COMPARE_GT:
+ case BRIG_COMPARE_GTU:
+ case BRIG_COMPARE_SGT:
+ case BRIG_COMPARE_SGTU:
+ return (src0 > src1);
+
+ case BRIG_COMPARE_GE:
+ case BRIG_COMPARE_GEU:
+ case BRIG_COMPARE_SGE:
+ case BRIG_COMPARE_SGEU:
+ return (src0 >= src1);
+
+ // "num": true iff neither operand is NaN (x == x fails for NaN)
+ case BRIG_COMPARE_NUM:
+ case BRIG_COMPARE_SNUM:
+ return (src0 == src0) || (src1 == src1);
+
+ // "nan": true iff at least one operand is NaN
+ case BRIG_COMPARE_NAN:
+ case BRIG_COMPARE_SNAN:
+ return (src0 != src0) || (src1 != src1);
+
+ default:
+ fatal("Bad cmpOp value %d\n", (int)cmpOp);
+ }
+ }
+
+ // HSAIL firstbit: number of leading bits equal to the sign bit
+ // before the first differing bit; -1 if no such bit exists
+ // (i.e. src0 is 0, or -1 for signed types).
+ template<typename T>
+ int32_t
+ firstbit(T src0)
+ {
+ if (!src0)
+ return -1;
+
+ //handle positive and negative numbers
+ T tmp = (src0 < 0) ? (~src0) : (src0);
+
+ // ~(-1) == 0: without this check the scan below would never
+ // terminate for signed src0 == -1
+ if (!tmp)
+ return -1;
+
+ //the starting pos is MSB
+ int pos = 8 * sizeof(T) - 1;
+ int cnt = 0;
+
+ // search the first bit set to 1; test via (tmp >> pos) rather
+ // than (1 << pos): the literal 1 is an int, so shifting it by
+ // up to 63 bits for 64-bit T is undefined behavior
+ while (!((tmp >> pos) & 1)) {
+ ++cnt;
+ --pos;
+ }
+ return cnt;
+ }
+
+ const char* cmpOpToString(Brig::BrigCompareOperation cmpOp);
+
+ // Two-source instruction that additionally decodes the compare
+ // operation (eq/ne/lt/...) from the BrigInstCmp record.
+ template<typename DestOperandType, typename SrcOperandType>
+ class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType,
+ 2>
+ {
+ protected:
+ Brig::BrigCompareOperation cmpOp;
+
+ public:
+ CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj,
+ _opcode)
+ {
+ // the BRIG record must really be a cmp instruction before we
+ // downcast to read the compare field
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP);
+ Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib;
+ cmpOp = (Brig::BrigCompareOperation)i->compare;
+ }
+ };
+
+ // Typed compare instruction; its disassembly suffix encodes the
+ // compare op plus destination and source type labels,
+ // e.g. "_lt_b1_f32".
+ template<typename DestDataType, typename SrcDataType>
+ class CmpInst : public CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>
+ {
+ public:
+ std::string
+ opcode_suffix()
+ {
+ return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp),
+ DestDataType::label, SrcDataType::label);
+ }
+
+ CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ // Conversion instruction (one source): the suffix carries both the
+ // destination and source type labels, e.g. "_f64_f32".
+ template<typename DestDataType, typename SrcDataType>
+ class CvtInst : public CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType, 1>
+ {
+ public:
+ std::string opcode_suffix()
+ {
+ return csprintf("_%s_%s", DestDataType::label, SrcDataType::label);
+ }
+
+ CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType,
+ 1>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ // Base for instructions with no register operands at all (ret,
+ // barrier, memfence). All operand queries return "nothing".
+ class SpecialInstNoSrcNoDest : public HsailGPUStaticInst
+ {
+ public:
+ SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 0; }
+ };
+
+ // Base for instructions with a single destination and no sources;
+ // the lone operand (index 0) is always the destination.
+ template<typename DestOperandType>
+ class SpecialInstNoSrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+
+ void generateDisassembly()
+ {
+ disassembly = csprintf("%s %s", opcode, dest.disassemble());
+ }
+
+ public:
+ SpecialInstNoSrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ // counts as a register destination only if dest is a vector reg
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ // Typed wrapper over SpecialInstNoSrcBase exposing the
+ // destination's CType for execute() implementations.
+ template<typename DestDataType>
+ class SpecialInstNoSrc :
+ public SpecialInstNoSrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ // Base for instructions with one destination and one immediate
+ // source. Note the operand-query methods report on dest only; the
+ // immediate src0 is not a register, so getNumOperands() is 1.
+ template<typename DestOperandType>
+ class SpecialInst1SrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typedef int SrcCType; // used in execute() template
+
+ typename DestOperandType::DestOperand dest;
+ ImmOperand<SrcCType> src0;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(),
+ src0.disassemble());
+ }
+
+ public:
+ SpecialInst1SrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ // Typed wrapper over SpecialInst1SrcBase exposing the
+ // destination's CType for execute() implementations.
+ template<typename DestDataType>
+ class SpecialInst1Src :
+ public SpecialInst1SrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ // HSAIL "ret": terminates the executing work-items; the heavy
+ // lifting (masking off lanes, freeing wavefront resources) lives
+ // in Ret::execute() in main.cc.
+ class Ret : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Ret(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "ret")
+ {
+ o_type = Enums::OT_RET;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ // HSAIL "barrier": decodes the barrier width from the BrigInstBr
+ // record; execute() (main.cc) bumps the barrier count and stalls
+ // the wavefront.
+ class Barrier : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+ uint8_t width;
+
+ Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "barrier")
+ {
+ o_type = Enums::OT_BARRIER;
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_BR);
+ width = (uint8_t)((Brig::BrigInstBr*)ib)->width;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ // HSAIL "memfence": decodes per-segment memory scopes and the
+ // memory order from the BrigInstMemFence record. Group-only fences
+ // are no-ops (LDS is sequentially consistent); fences touching the
+ // global segment are injected into the global memory pipeline.
+ class MemFence : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Brig::BrigMemoryOrder memFenceMemOrder;
+ Brig::BrigMemoryScope memFenceScopeSegGroup;
+ Brig::BrigMemoryScope memFenceScopeSegGlobal;
+ Brig::BrigMemoryScope memFenceScopeSegImage;
+
+ MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "memfence")
+ {
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE);
+
+ memFenceScopeSegGlobal = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope;
+
+ memFenceScopeSegGroup = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope;
+
+ memFenceScopeSegImage = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope;
+
+ memFenceMemOrder = (Brig::BrigMemoryOrder)
+ ((Brig::BrigInstMemFence*)ib)->memoryOrder;
+
+ // set o_type based on scopes; the image segment scope is
+ // decoded above but does not influence o_type
+ if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE &&
+ memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_BOTH_MEMFENCE;
+ } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_GLOBAL_MEMFENCE;
+ } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_SHARED_MEMFENCE;
+ } else {
+ fatal("MemFence constructor: bad scope specifiers\n");
+ }
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wave = gpuDynInst->wavefront();
+ wave->computeUnit->injectGlobalMemFence(gpuDynInst);
+ }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+ // 2 cases:
+ // * memfence to a sequentially consistent memory (e.g., LDS).
+ // These can be handled as no-ops.
+ // * memfence to a relaxed consistency cache (e.g., Hermes, Viper,
+ // etc.). We send a packet, tagged with the memory order and
+ // scope, and let the GPU coalescer handle it.
+
+ if (o_type == Enums::OT_GLOBAL_MEMFENCE ||
+ o_type == Enums::OT_BOTH_MEMFENCE) {
+ gpuDynInst->simdId = w->simdId;
+ gpuDynInst->wfSlotId = w->wfSlotId;
+ gpuDynInst->wfDynId = w->wfDynId;
+ gpuDynInst->kern_id = w->kern_id;
+ gpuDynInst->cu_id = w->computeUnit->cu_id;
+
+ gpuDynInst->memoryOrder =
+ getGenericMemoryOrder(memFenceMemOrder);
+ gpuDynInst->scope =
+ getGenericMemoryScope(memFenceScopeSegGlobal);
+ gpuDynInst->useContinuation = false;
+ GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
+ gmp->getGMReqFIFO().push(gpuDynInst);
+
+ // the fence was not issued through the normal load/store
+ // accounting, so undo the speculative per-type counts and
+ // track it as a single outstanding request
+ w->wr_gm_reqs_in_pipe--;
+ w->rd_gm_reqs_in_pipe--;
+ w->mem_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ } else if (o_type == Enums::OT_SHARED_MEMFENCE) {
+ // no-op
+ } else {
+ fatal("MemFence execute: bad o_type\n");
+ }
+ }
+ };
+
+ // HSAIL "call". Native HSAIL function calls are not implemented;
+ // the opcode is used to dispatch gem5 "magic" pseudo instructions,
+ // identified by a "__gem5_hsail_op" prefix in the callee name.
+ // Operand layout: dest = return-value list, src0 = function ref,
+ // src1 = argument list.
+ class Call : public HsailGPUStaticInst
+ {
+ public:
+ // private helper functions
+ void calcAddr(Wavefront* w, GPUDynInstPtr m);
+
+ void
+ generateDisassembly()
+ {
+ if (dest.disassemble() == "") {
+ disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(),
+ src1.disassemble());
+ } else {
+ disassembly = csprintf("%s %s (%s) (%s)", opcode,
+ src0.disassemble(), dest.disassemble(),
+ src1.disassemble());
+ }
+ }
+
+ // true if the callee name marks this call as a gem5 pseudo op
+ bool
+ isPseudoOp()
+ {
+ std::string func_name = src0.disassemble();
+ if (func_name.find("__gem5_hsail_op") != std::string::npos) {
+ return true;
+ }
+ return false;
+ }
+
+ // member variables
+ ListOperand dest;
+ FunctionRefOperand src0;
+ ListOperand src1;
+ HsailCode *func_ptr;
+
+ // exec function for pseudo instructions mapped on top of call opcode
+ void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ // user-defined pseudo instructions
+ void MagicPrintLane(Wavefront *w);
+ void MagicPrintLane64(Wavefront *w);
+ void MagicPrintWF32(Wavefront *w);
+ void MagicPrintWF64(Wavefront *w);
+ void MagicPrintWFFloat(Wavefront *w);
+ void MagicSimBreak(Wavefront *w);
+ void MagicPrefixSum(Wavefront *w);
+ void MagicReduction(Wavefront *w);
+ void MagicMaskLower(Wavefront *w);
+ void MagicMaskUpper(Wavefront *w);
+ void MagicJoinWFBar(Wavefront *w);
+ void MagicWaitWFBar(Wavefront *w);
+ void MagicPanic(Wavefront *w);
+
+ void MagicAtomicNRAddGlobalU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicAtomicNRAddGroupU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ void MagicXactCasLd(Wavefront *w);
+ void MagicMostSigThread(Wavefront *w);
+ void MagicMostSigBroadcast(Wavefront *w);
+
+ void MagicPrintWF32ID(Wavefront *w);
+ void MagicPrintWFID64(Wavefront *w);
+
+ Call(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "call")
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ // resolve real (non-pseudo) callees at decode time so a
+ // missing function fails fast rather than at execute
+ func_ptr = nullptr;
+ std::string func_name = src0.disassemble();
+ if (!isPseudoOp()) {
+ func_ptr = dynamic_cast<HsailCode*>(obj->
+ getFunction(func_name));
+
+ if (!func_ptr)
+ fatal("call::exec cannot find function: %s\n", func_name);
+ }
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ std::string func_name = src0.disassemble();
+ if (isPseudoOp()) {
+ execPseudoInst(w, gpuDynInst);
+ } else {
+ fatal("Native HSAIL functions are not yet implemented: %s\n",
+ func_name);
+ }
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 2; }
+ };
+
+ // generic "not": bitwise complement for integral types, with a
+ // logical-not specialization for bool (where ~ would be wrong)
+ template<typename T> T heynot(T arg) { return ~arg; }
+ template<> inline bool heynot<bool>(bool arg) { return !arg; }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_DECL_HH__
diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc
new file mode 100644
index 000000000..bbaeb13e6
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+ // captures the HSAIL code object currently being decoded so the
+ // instruction can later refer back to its containing kernel/function
+ HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
+ const std::string &opcode)
+ : GPUStaticInst(opcode), hsailCode(obj->currentCode)
+ {
+ }
+
+ // default disassembly is just the opcode string; operand-bearing
+ // subclasses override this
+ void
+ HsailGPUStaticInst::generateDisassembly()
+ {
+ disassembly = opcode;
+ }
+
+ // lazily generates and caches the disassembly string on first use
+ const std::string&
+ HsailGPUStaticInst::disassemble()
+ {
+ if (disassembly.empty()) {
+ generateDisassembly();
+ assert(!disassembly.empty());
+ }
+
+ return disassembly;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh
new file mode 100644
index 000000000..29aab1f70
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing HSAIL GPU static instructions.
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+class BrigObject;
+class HsailCode;
+
+namespace HsailISA
+{
+ // Base class for all HSAIL GPU static instructions; adds a back
+ // pointer to the enclosing HSAIL code object and lazy disassembly.
+ class HsailGPUStaticInst : public GPUStaticInst
+ {
+ public:
+ HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
+ void generateDisassembly();
+ const std::string &disassemble();
+ // all HSAIL instructions are modeled as a fixed 4 bytes
+ uint32_t instSize() { return 4; }
+
+ protected:
+ // code object this instruction was decoded from
+ HsailCode *hsailCode;
+ };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
new file mode 100644
index 000000000..4e70bf46a
--- /dev/null
+++ b/src/arch/hsail/insts/main.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/decl.hh"
+#include "debug/GPUExec.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+
+namespace HsailISA
+{
+ // human-readable type labels used in disassembly opcode suffixes
+ // (bit, signed, unsigned, and floating-point type families)
+ template<> const char *B1::label = "b1";
+ template<> const char *B8::label = "b8";
+ template<> const char *B16::label = "b16";
+ template<> const char *B32::label = "b32";
+ template<> const char *B64::label = "b64";
+
+ template<> const char *S8::label = "s8";
+ template<> const char *S16::label = "s16";
+ template<> const char *S32::label = "s32";
+ template<> const char *S64::label = "s64";
+
+ template<> const char *U8::label = "u8";
+ template<> const char *U16::label = "u16";
+ template<> const char *U32::label = "u32";
+ template<> const char *U64::label = "u64";
+
+ template<> const char *F32::label = "f32";
+ template<> const char *F64::label = "f64";
+
+ // maps a BRIG compare operation to its HSAIL mnemonic for
+ // disassembly; unknown values yield "unknown" rather than faulting
+ const char*
+ cmpOpToString(Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ return "eq";
+ case BRIG_COMPARE_NE:
+ return "ne";
+ case BRIG_COMPARE_LT:
+ return "lt";
+ case BRIG_COMPARE_LE:
+ return "le";
+ case BRIG_COMPARE_GT:
+ return "gt";
+ case BRIG_COMPARE_GE:
+ return "ge";
+ case BRIG_COMPARE_EQU:
+ return "equ";
+ case BRIG_COMPARE_NEU:
+ return "neu";
+ case BRIG_COMPARE_LTU:
+ return "ltu";
+ case BRIG_COMPARE_LEU:
+ return "leu";
+ case BRIG_COMPARE_GTU:
+ return "gtu";
+ case BRIG_COMPARE_GEU:
+ return "geu";
+ case BRIG_COMPARE_NUM:
+ return "num";
+ case BRIG_COMPARE_NAN:
+ return "nan";
+ case BRIG_COMPARE_SEQ:
+ return "seq";
+ case BRIG_COMPARE_SNE:
+ return "sne";
+ case BRIG_COMPARE_SLT:
+ return "slt";
+ case BRIG_COMPARE_SLE:
+ return "sle";
+ case BRIG_COMPARE_SGT:
+ return "sgt";
+ case BRIG_COMPARE_SGE:
+ return "sge";
+ case BRIG_COMPARE_SGEU:
+ return "sgeu";
+ case BRIG_COMPARE_SEQU:
+ return "sequ";
+ case BRIG_COMPARE_SNEU:
+ return "sneu";
+ case BRIG_COMPARE_SLTU:
+ return "sltu";
+ case BRIG_COMPARE_SLEU:
+ return "sleu";
+ case BRIG_COMPARE_SNUM:
+ return "snum";
+ case BRIG_COMPARE_SNAN:
+ return "snan";
+ case BRIG_COMPARE_SGTU:
+ return "sgtu";
+ default:
+ return "unknown";
+ }
+ }
+
+ // Retires the work-items currently enabled by the execution mask.
+ // When the whole wavefront has completed it releases its LDS and
+ + // vector-register resources; when the whole work-group has
+ // completed it injects a kernel-end release fence (or reschedules
+ // the dispatcher if other wavefronts remain).
+ void
+ Ret::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ // mask off completed work-items
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->init_mask[lane] = 0;
+ }
+
+ }
+
+ // delete extra instructions fetched for completed work-items
+ w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+ w->instructionBuffer.end());
+ if (w->pendingFetch) {
+ w->dropFetch = true;
+ }
+
+ // if all work-items have completed, then wave-front is done
+ if (w->init_mask.none()) {
+ w->status = Wavefront::S_STOPPED;
+
+ // refCount counts live wavefronts of this work-group sharing
+ // the LDS allocation
+ int32_t refCount = w->computeUnit->getLds().
+ decreaseRefCounter(w->dispatchid, w->wg_id);
+
+ DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
+ w->computeUnit->cu_id, w->wg_id, refCount);
+
+ // free the vector registers of the completed wavefront
+ w->computeUnit->vectorRegsReserved[w->simdId] -=
+ w->reservedVectorRegs;
+
+ assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
+
+ // the register region may wrap around the end of the VRF
+ uint32_t endIndex = (w->startVgprIndex +
+ w->reservedVectorRegs - 1) %
+ w->computeUnit->vrf[w->simdId]->numRegs();
+
+ w->computeUnit->vrf[w->simdId]->manager->
+ freeRegion(w->startVgprIndex, endIndex);
+
+ w->reservedVectorRegs = 0;
+ w->startVgprIndex = 0;
+ w->computeUnit->completedWfs++;
+
+ DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
+
+ if (!refCount) {
+ // Notify Memory System of Kernel Completion
+ // Kernel End = isKernel + isRelease
+ w->status = Wavefront::S_RETURNING;
+ GPUDynInstPtr local_mempacket = gpuDynInst;
+ local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
+ local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
+ local_mempacket->useContinuation = false;
+ local_mempacket->simdId = w->simdId;
+ local_mempacket->wfSlotId = w->wfSlotId;
+ local_mempacket->wfDynId = w->wfDynId;
+ w->computeUnit->injectGlobalMemFence(local_mempacket, true);
+ } else {
+ w->computeUnit->shader->dispatcher->scheduleDispatch();
+ }
+ }
+ }
+
+ // Marks the wavefront as having arrived at the barrier: bumps its
+ // barrier count past the old value and stalls it until released.
+ void
+ Barrier::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ // a wavefront may only be at one barrier at a time
+ assert(w->barrier_cnt == w->old_barrier_cnt);
+ w->barrier_cnt = w->old_barrier_cnt + 1;
+ w->stalledAtBarrier = true;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc
new file mode 100644
index 000000000..97d4c902b
--- /dev/null
+++ b/src/arch/hsail/insts/mem.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/mem.hh"
+
+#include "arch/hsail/Brig.h"
+#include "enums/OpType.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+ const char* atomicOpToString(BrigAtomicOperation brigOp);
+
+ // Maps a BRIG atomic operation to the simulator's MemOpType enum,
+ // distinguishing value-returning atomics (BRIG_OPCODE_ATOMIC) from
+ // no-return atomics (BRIG_OPCODE_ATOMICNORET). Unknown codes fault.
+ Enums::MemOpType
+ brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
+ {
+ if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_AAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_AOR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_AXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ACAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_AEXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_AADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_AINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ADEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_AMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_AMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ASUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_ANRAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_ANROR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_ANRXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ANRCAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_ANREXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_ANRADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_ANRINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ANRDEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_ANRMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_ANRMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ANRSUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else {
+ fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
+ }
+ }
+
+ // maps a BRIG atomic operation to its mnemonic for disassembly;
+ // unknown values yield "unknown" rather than faulting
+ const char*
+ atomicOpToString(BrigAtomicOperation brigOp)
+ {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return "and";
+ case BRIG_ATOMIC_OR:
+ return "or";
+ case BRIG_ATOMIC_XOR:
+ return "xor";
+ case BRIG_ATOMIC_CAS:
+ return "cas";
+ case BRIG_ATOMIC_EXCH:
+ return "exch";
+ case BRIG_ATOMIC_ADD:
+ return "add";
+ case BRIG_ATOMIC_WRAPINC:
+ return "inc";
+ case BRIG_ATOMIC_WRAPDEC:
+ return "dec";
+ case BRIG_ATOMIC_MIN:
+ return "min";
+ case BRIG_ATOMIC_MAX:
+ return "max";
+ case BRIG_ATOMIC_SUB:
+ return "sub";
+ default:
+ return "unknown";
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
new file mode 100644
index 000000000..d3ce76dee
--- /dev/null
+++ b/src/arch/hsail/insts/mem.hh
@@ -0,0 +1,1629 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
+#define __ARCH_HSAIL_INSTS_MEM_HH__
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+
+namespace HsailISA
+{
+    // Mixin carrying per-memory-instruction state shared by the ld/st/lda
+    // instruction templates: the per-lane access size in bytes and a
+    // pointer to the (externally owned) address operand.
+    class MemInst
+    {
+      public:
+        // Default state: no access size, no address operand yet.
+        MemInst() : size(0), addr_operand(nullptr) { }
+
+        // Derive the access size from the memory data type.  64-bit
+        // types access 8 bytes, 32-bit types 4, 16-bit types 2, and all
+        // remaining (8-bit) types 1.
+        MemInst(Enums::MemType m_type) : addr_operand(nullptr)
+        {
+            switch (m_type) {
+              case Enums::M_U64:
+              case Enums::M_S64:
+              case Enums::M_F64:
+                size = 8;
+                break;
+              case Enums::M_U32:
+              case Enums::M_S32:
+              case Enums::M_F32:
+                size = 4;
+                break;
+              case Enums::M_U16:
+              case Enums::M_S16:
+              case Enums::M_F16:
+                size = 2;
+                break;
+              default:
+                // all other types are byte sized
+                size = 1;
+                break;
+            }
+        }
+
+        // Record the address operand; the operand object is owned by the
+        // enclosing instruction, not by MemInst.
+        void
+        init_addr(AddrOperandBase *_addr_operand)
+        {
+            addr_operand = _addr_operand;
+        }
+
+      private:
+        int size;
+        AddrOperandBase *addr_operand;
+
+      public:
+        int getMemOperandSize() const { return size; }
+        AddrOperandBase *getAddressOperand() const { return addr_operand; }
+    };
+
+    // Base class for the HSAIL lda ("load address") instruction: computes
+    // the address described by operand 1 and writes it into the destination
+    // register given by operand 0.  No memory is accessed.
+    template<typename DestOperandType, typename AddrOperandType>
+    class LdaInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        // Decode the destination (operand 0) and address (operand 1)
+        // from the BRIG instruction's operand list.
+        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                    const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(op_offs, obj);
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Operand bookkeeping: index 0 is the destination register;
+        // index 1 exists only when the address expression itself uses a
+        // register (see getNumOperands()).
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        // Only the address register (index > 0) can be a source.
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+        // Two operands when the address uses a vector register, else
+        // just the destination.
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            return 1;
+        }
+    };
+
+    // Concrete lda instruction: binds the generic base to a destination
+    // data type and registers its address operand with the MemInst mixin
+    // so common code can query the operand.
+    template<typename DestDataType, typename AddrOperandType>
+    class LdaInst :
+        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                const char *_opcode)
+            : LdaInstBase<typename DestDataType::OperandType,
+                          AddrOperandType>(ib, obj, _opcode)
+        {
+            // expose the decoded address operand through MemInst
+            init_addr(&this->addr);
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode an lda instruction, selecting the address-operand flavor
+    // (no register, single- or double-width register) from the BRIG
+    // operand that describes the address (operand 1).  V2/V4 vector
+    // registers are not legal address operands.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
+
+        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
+        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (regDataType.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
+              default:
+                // report the field the switch actually examined
+                // (regKind, not type), matching decodeLd2/decodeSt
+                fatal("Bad ldas register operand type %d\n",
+                      regDataType.regKind);
+            }
+        } else {
+            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
+        }
+    }
+
+    // Base class for HSAIL loads.  Handles two BRIG encodings: a plain
+    // ld (BrigInstMem) and an atomic used as a load (BrigInstAtomic).
+    // Captures the segment, memory order/scope, equivalence class and
+    // width, and decodes the destination and address operands.
+    template<typename MemOperandType, typename DestOperandType,
+             typename AddrOperandType>
+    class LdInstBase : public HsailGPUStaticInst
+    {
+      public:
+        Brig::BrigWidth8_t width;
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigMemoryScope memoryScope;
+        unsigned int equivClass;
+        // kernarg/arg loads read launch arguments rather than memory
+        // written by the kernel itself
+        bool isArgLoad()
+        {
+            return segment == Brig::BRIG_SEGMENT_KERNARG ||
+                   segment == Brig::BRIG_SEGMENT_ARG;
+        }
+        // Decode a plain ld (BrigInstMem): no ordering/scope, takes the
+        // equivalence class and width from the instruction.
+        void
+        initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // map the BRIG segment onto the simulator's operation type
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = ldst->width;
+            // operand 0 is the destination register (skipped if the
+            // operand is not a register)
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            // operand 1 is the address
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Decode an atomic load (BrigInstAtomic): ordering and scope come
+        // from the instruction; equivalence class is fixed at 0 and the
+        // width at 1.
+        void
+        initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            equivClass = 0;
+
+            // same segment -> operation-type mapping as initLd()
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = BRIG_WIDTH_1;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands,1);
+            addr.init(op_offs, obj);
+        }
+
+        // Dispatch to the decoder matching the BRIG encoding.
+        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_LD) {
+                initLd(ib, obj, _opcode);
+            } else {
+                initAtomicLd(ib, obj, _opcode);
+            }
+        }
+
+        // Operand bookkeeping: index 0 is the destination, index 1 (when
+        // the address uses a vector register) the address register.
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        // Only the address register (index > 0) acts as a source.
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+    };
+
+    // Concrete HSAIL load.  Supports scalar destinations as well as
+    // v2/v4 register-list destinations (up to 4 registers), and issues
+    // the memory-system requests for each active lane.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrOperandType>
+    class LdInst :
+        public LdInstBase<typename MemDataType::CType,
+                          typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+        // destinations when operand 0 is an operand list (vector ld)
+        typename DestDataType::OperandType::DestOperand dest_vect[4];
+        uint16_t num_dest_operands;
+        void generateDisassembly();
+
+      public:
+        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+            : LdInstBase<typename MemDataType::CType,
+                         typename DestDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)brigOp;
+
+                // the data section stores the list's byte length; each
+                // element reference is 4 bytes
+                num_dest_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_dest_operands <= 4);
+            } else {
+                num_dest_operands = 1;
+            }
+
+            if (num_dest_operands > 1) {
+                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_dest_operands; ++i) {
+                    dest_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        // Issue the load: reads the LDS chunk directly for group-segment
+        // accesses, otherwise sends one read request per active lane and
+        // destination operand through the compute unit.
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            // one outstanding-status bit per lane in the exec mask
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            if (num_dest_operands > 1) {
+                // per-lane count of expected responses for vector loads
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_dest_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_dest_operands; ++k) {
+
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        // consecutive destination registers read
+                        // consecutive elements at the lane's address
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            // load from shared memory
+                            *d = gpuDynInst->wavefront()->ldsChunk->
+                                read<c0>(vaddr);
+                        } else {
+                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
+                                          gpuDynInst->computeUnit()->masterId(),
+                                          0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+                            pkt->dataStatic(d);
+
+                            if (gpuDynInst->computeUnit()->shader->
+                                separate_acquire_release &&
+                                gpuDynInst->memoryOrder ==
+                                Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                                // if this load has acquire semantics,
+                                // set the response continuation function
+                                // to perform an Acquire request
+                                gpuDynInst->execContinuation =
+                                    &GPUStaticInst::execLdAcq;
+
+                                gpuDynInst->useContinuation = true;
+                            } else {
+                                // the request will be finished when
+                                // the load completes
+                                gpuDynInst->useContinuation = false;
+                            }
+                            // translation is performed in sendRequest()
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      private:
+        void
+        execLdAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after the load has complete and if the load has acquire
+            // semantics, issue an acquire request.
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+
+      public:
+        // group-segment loads hit the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+        // Operand bookkeeping.  When the address uses a register it is
+        // reported as the LAST operand, after the destination(s); the
+        // "num_dest_operands != getNumOperands()" test detects that case.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            if (num_dest_operands > 1) {
+                return dest_vect[operandIndex].isVectorRegister();
+            }
+            else if (num_dest_operands == 1) {
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isVectorRegister();
+            }
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isCondRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isCondRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isScalarRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isScalarRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isScalarRegister();
+            return false;
+        }
+        // only the trailing address register is a source
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return false;
+            return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.opSize());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].opSize());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.opSize());
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.regIndex());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].regIndex());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.regIndex());
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return(num_dest_operands+1);
+            else
+                return(num_dest_operands);
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Second decode stage for ld: given the memory and destination data
+    // types, pick the address-operand flavor by inspecting the BRIG
+    // operand that describes the address (operand 1).
+    template<typename MemDT, typename DestDT>
+    GPUStaticInst*
+    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned addr_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo info = findRegDataType(addr_offs, obj);
+
+        // plain address expression: no address register involved
+        if (info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS)
+            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
+
+        if (info.kind != Brig::BRIG_KIND_OPERAND_REGISTER &&
+            info.kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+            fatal("Bad ld register operand kind %d\n", info.kind);
+        }
+
+        // register-based address: choose by register width
+        if (info.regKind == Brig::BRIG_REGISTER_KIND_SINGLE)
+            return new LdInst<MemDT, DestDT, SRegAddrOperand>(ib, obj, "ld");
+
+        if (info.regKind == Brig::BRIG_REGISTER_KIND_DOUBLE)
+            return new LdInst<MemDT, DestDT, DRegAddrOperand>(ib, obj, "ld");
+
+        fatal("Bad ld register operand type %d\n", info.regKind);
+    }
+
+    // First decode stage for ld: map the instruction's BRIG type and the
+    // destination register width onto a concrete destination data type,
+    // then defer to decodeLd2 for the address-operand flavor.  Sub-word
+    // types widen to 32 bits; floats reuse the same-width unsigned type.
+    template<typename MemDT>
+    GPUStaticInst*
+    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);
+
+        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+        switch(dest.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+              case Brig::BRIG_TYPE_B16:
+              case Brig::BRIG_TYPE_B32:
+                return decodeLd2<MemDT, B32>(ib, obj);
+              case Brig::BRIG_TYPE_U8:
+              case Brig::BRIG_TYPE_U16:
+              case Brig::BRIG_TYPE_U32:
+                return decodeLd2<MemDT, U32>(ib, obj);
+              case Brig::BRIG_TYPE_S8:
+              case Brig::BRIG_TYPE_S16:
+              case Brig::BRIG_TYPE_S32:
+                return decodeLd2<MemDT, S32>(ib, obj);
+              case Brig::BRIG_TYPE_F16:
+              case Brig::BRIG_TYPE_F32:
+                // float destinations are carried in unsigned registers
+                return decodeLd2<MemDT, U32>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B64:
+                return decodeLd2<MemDT, B64>(ib, obj);
+              case Brig::BRIG_TYPE_U64:
+                return decodeLd2<MemDT, U64>(ib, obj);
+              case Brig::BRIG_TYPE_S64:
+                return decodeLd2<MemDT, S64>(ib, obj);
+              case Brig::BRIG_TYPE_F64:
+                // float destinations are carried in unsigned registers
+                return decodeLd2<MemDT, U64>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          default:
+            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
+                  ib->type);
+        }
+    }
+
+    // Base class for HSAIL stores.  Handles two BRIG encodings: a plain
+    // st (BrigInstMem) and an atomic used as a store (BrigInstAtomic).
+    // Note the operand order differs: st has (src, addr) while the
+    // atomic form has (addr, src).
+    template<typename MemDataType, typename SrcOperandType,
+             typename AddrOperandType>
+    class StInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename SrcOperandType::SrcOperand src;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigMemoryOrder memoryOrder;
+        unsigned int equivClass;
+
+        // Decode a plain st (BrigInstMem): no ordering/scope; source is
+        // operand 0, address operand 1.
+        void
+        initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // map the BRIG segment onto the simulator's operation type;
+            // kernarg is absent here since kernarg is not writable
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            // operand 0 is the source: either an immediate (constant
+            // bytes) or a register
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
+                (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
+                src.init(op_offs, obj);
+            }
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Decode an atomic store (BrigInstAtomic): ordering and scope
+        // come from the instruction; address is operand 0, source
+        // operand 1 (reversed relative to initSt).
+        void
+        initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            equivClass = 0;
+
+            // same segment -> operation-type mapping as initSt()
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            addr.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            src.init(op_offs, obj);
+        }
+
+        // Dispatch to the decoder matching the BRIG encoding.
+        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_ST) {
+                initSt(ib, obj, _opcode);
+            } else {
+                initAtomicSt(ib, obj, _opcode);
+            }
+        }
+
+        // Operand bookkeeping: stores have no destination; index 0 is
+        // the source, index 1 (when present) the address register.
+        int numDstRegOperands() { return 0; }
+        int numSrcRegOperands()
+        {
+            return src.isVectorRegister() + this->addr.isVectorRegister();
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isVectorRegister() :
+                   this->addr.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isCondRegister() :
+                   this->addr.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isScalarRegister() :
+                   this->addr.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.opSize() : this->addr.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.regIndex() : this->addr.regIndex();
+        }
+    };
+
+
+    // Concrete HSAIL store.  Supports scalar sources as well as v2/v4
+    // register-list sources (up to 4 registers), and implements
+    // release semantics by issuing a release fence before the stores
+    // when the shader splits acquire/release.
+    template<typename MemDataType, typename SrcDataType,
+             typename AddrOperandType>
+    class StInst :
+        public StInstBase<MemDataType, typename SrcDataType::OperandType,
+                          AddrOperandType>,
+        public MemInst
+    {
+      public:
+        // sources when the store's data operand is an operand list
+        typename SrcDataType::OperandType::SrcOperand src_vect[4];
+        uint16_t num_src_operands;
+        void generateDisassembly();
+
+        // srcIdx selects which BRIG operand carries the data (0 for a
+        // plain st, 1 for the atomic form).
+        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode, int srcIdx)
+            : StInstBase<MemDataType, typename SrcDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(SrcDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            BrigRegOperandInfo rinfo;
+            unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
+            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                const Brig::BrigOperandConstantBytes *op =
+                    (Brig::BrigOperandConstantBytes*)baseOp;
+
+                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
+                                           Brig::BRIG_TYPE_NONE);
+            } else {
+                rinfo = findRegDataType(op_offs, obj);
+            }
+
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)baseOp;
+
+                // the data section stores the list's byte length; each
+                // element reference is 4 bytes
+                num_src_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_src_operands <= 4);
+            } else {
+                num_src_operands = 1;
+            }
+
+            if (num_src_operands > 1) {
+                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_src_operands; ++i) {
+                    src_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before performing a store, check if this store has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    // execSt runs as the continuation once the release
+                    // fence completes
+                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
+                    gpuDynInst->useContinuation = true;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, perform stores immediately
+            execSt(gpuDynInst);
+        }
+
+        // group-segment stores hit the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execSt may be called through a continuation
+        // if the store had release semantics. see comment for
+        // execSt in gpu_static_inst.hh
+        void
+        execSt(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            // one outstanding-status bit per lane in the exec mask
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            if (num_src_operands > 1) {
+                // per-lane count of expected responses for vector stores
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_src_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_src_operands; ++k) {
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        // consecutive source registers write consecutive
+                        // elements at the lane's address
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            //store to shared memory
+                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
+                                                                         *d);
+                        } else {
+                            Request *req =
+                              new Request(0, vaddr, sizeof(c0), 0,
+                                          gpuDynInst->computeUnit()->masterId(),
+                                          0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+                            pkt->dataStatic<c0>(d);
+
+                            // translation is performed in sendRequest()
+                            // the request will be finished when the store completes
+                            gpuDynInst->useContinuation = false;
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      public:
+        // Operand bookkeeping.  When the address uses a register it is
+        // reported as the last operand, at index num_src_operands.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isVectorRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isVectorRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isVectorRegister();
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isCondRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isCondRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isScalarRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isScalarRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isScalarRegister();
+            return false;
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.opSize();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].opSize();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.opSize();
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.regIndex();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].regIndex();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.regIndex();
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return num_src_operands + 1;
+            else
+                return num_src_operands;
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode a store (or atomic-as-store), selecting the address-operand
+    // flavor from the BRIG operand holding the address.  For a plain st
+    // the data is operand 0 and the address operand 1; the atomic forms
+    // reverse that order.  V2/V4 address registers are not allowed.
+    template<typename DataType, typename SrcDataType>
+    GPUStaticInst*
+    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        int srcIdx = 0;
+        int destIdx = 1;
+        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
+            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
+            srcIdx = 1;
+            destIdx = 0;
+        }
+        unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx);
+
+        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new StInst<DataType, SrcDataType,
+                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
+        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (tmp.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new StInst<DataType, SrcDataType,
+                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new StInst<DataType, SrcDataType,
+                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
+              default:
+                // report the field the switch actually examined
+                // (regKind, not type), matching decodeLd2
+                fatal("Bad st register operand type %d\n", tmp.regKind);
+            }
+        } else {
+            fatal("Bad st register operand kind %d\n", tmp.kind);
+        }
+    }
+
+ Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
+ Brig::BrigAtomicOperation brigOp);
+
+    // Decoded operand/state bundle shared by all HSAIL atomics.  Extracts
+    // segment, memory order/scope and the atomic operation from the
+    // BrigInstAtomic, and initializes dest (when HasDst), the address, and
+    // the NumSrcOperands sources.  Accessor index convention:
+    //   [0, NumSrcOperands)  -> source operands
+    //   NumSrcOperands       -> address operand
+    //   NumSrcOperands + 1   -> destination (when present)
+    // NOTE(review): getNumOperands() only reserves the extra dest slot when
+    // the address is a vector register, so with a non-register address the
+    // last valid index maps to the address accessors rather than dest --
+    // confirm this is intentional.
+    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>
+    class AtomicInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename OperandType::DestOperand dest;
+        typename OperandType::SrcOperand src[NumSrcOperands];
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigAtomicOperation atomicOperation;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigOpcode opcode;
+        Enums::MemOpType opType;
+
+        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
+            opcode = (BrigOpcode)ib->opcode;
+            opType = brigAtomicToMemOpType(opcode, atomicOperation);
+
+            // classify the op by segment for stats/scheduling
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_ATOMIC;
+                break;
+
+              default:
+                panic("Atomic: segment %d not supported\n", segment);
+            }
+
+            // operand order in BRIG differs: returning atomics put the
+            // destination first, then the address, then the sources
+            if (HasDst) {
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                dest.init(op_offs, obj);
+
+                op_offs = obj->getOperandPtr(ib->operands, 1);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
+                    src[i].init(op_offs, obj);
+                }
+            } else {
+
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
+                    src[i].init(op_offs, obj);
+                }
+            }
+        }
+
+        // count of vector-register sources, including a register address
+        int numSrcRegOperands()
+        {
+            int operands = 0;
+            for (int i = 0; i < NumSrcOperands; i++) {
+                if (src[i].isVectorRegister() == true) {
+                    operands++;
+                }
+            }
+            if (addr.isVectorRegister())
+                operands++;
+            return operands;
+        }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands()
+        {
+            if (addr.isVectorRegister())
+                return(NumSrcOperands + 2);
+            return(NumSrcOperands + 1);
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isVectorRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isVectorRegister());
+            else
+                return dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isCondRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isCondRegister());
+            else
+                return dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isScalarRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isScalarRegister());
+            else
+                return dest.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return true;
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isVectorRegister());
+            else
+                return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            // bounds-check the index, consistent with the other accessors
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex <= NumSrcOperands)
+                return false;
+            else
+                return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].opSize());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.opSize());
+            else
+                return(dest.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].regIndex());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.regIndex());
+            else
+                return(dest.regIndex());
+        }
+    };
+
+    // Executable atomic instruction: couples the decoded operand state
+    // (AtomicInstBase) with memory-pipeline plumbing (MemInst), and
+    // implements the RMW itself plus any acquire/release fence traffic.
+    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>
+    class AtomicInst :
+        public AtomicInstBase<typename MemDataType::OperandType,
+                              AddrOperandType, NumSrcOperands, HasDst>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
+                             NumSrcOperands, HasDst>
+                (ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before doing the RMW, check if this atomic has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && (gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the RMW itself runs as a continuation once the
+                    // release fence has completed
+                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
+                    gpuDynInst->useContinuation = true;
+
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, execute the RMW immediately
+            execAtomic(gpuDynInst);
+
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+
+        // group-segment atomics go to the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execAtomic may be called through a continuation
+        // if the RMW had release semantics. see comment for
+        // execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomic(GPUDynInstPtr gpuDynInst) override
+        {
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            typedef typename MemDataType::CType c0;
+
+            // d: per-lane old value (result), e: first source operand,
+            // f: second source operand (CAS swap value)
+            c0 *d = &((c0*) gpuDynInst->d_data)[0];
+            c0 *e = &((c0*) gpuDynInst->a_data)[0];
+            c0 *f = &((c0*) gpuDynInst->x_data)[0];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (gpuDynInst->exec_mask[i]) {
+                    Addr vaddr = gpuDynInst->addr[i];
+
+                    if (isLocalMem()) {
+                        // LDS atomics are performed functionally, in place;
+                        // the old value is captured before the update
+                        Wavefront *wavefront = gpuDynInst->wavefront();
+                        *d = wavefront->ldsChunk->read<c0>(vaddr);
+
+                        switch (this->opType) {
+                          case Enums::MO_AADD:
+                          case Enums::MO_ANRADD:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
+                            break;
+                          case Enums::MO_ASUB:
+                          case Enums::MO_ANRSUB:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
+                            break;
+                          case Enums::MO_AMAX:
+                          case Enums::MO_ANRMAX:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AMIN:
+                          case Enums::MO_ANRMIN:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AAND:
+                          case Enums::MO_ANRAND:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
+                            break;
+                          case Enums::MO_AOR:
+                          case Enums::MO_ANROR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
+                            break;
+                          case Enums::MO_AXOR:
+                          case Enums::MO_ANRXOR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
+                            break;
+                          case Enums::MO_AINC:
+                          case Enums::MO_ANRINC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
+                            break;
+                          case Enums::MO_ADEC:
+                          case Enums::MO_ANRDEC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
+                            break;
+                          case Enums::MO_AEXCH:
+                          case Enums::MO_ANREXCH:
+                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
+                            break;
+                          case Enums::MO_ACAS:
+                          case Enums::MO_ANRCAS:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
+                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
+                            break;
+                          default:
+                            fatal("Unrecognized or invalid HSAIL atomic op "
+                                  "type.\n");
+                            break;
+                        }
+                    } else {
+                        // global atomics go out as per-lane SwapReq packets
+                        // carrying an atomic-op functor
+                        Request *req =
+                            new Request(0, vaddr, sizeof(c0), 0,
+                                        gpuDynInst->computeUnit()->masterId(),
+                                        0, gpuDynInst->wfDynId, i,
+                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
+                                        f, this->opType));
+
+                        gpuDynInst->setRequestFlags(req);
+                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+                        pkt->dataStatic(d);
+
+                        if (gpuDynInst->computeUnit()->shader->
+                            separate_acquire_release &&
+                            (gpuDynInst->memoryOrder ==
+                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
+                            // if this atomic has acquire semantics,
+                            // schedule the continuation to perform an
+                            // acquire after the RMW completes
+                            gpuDynInst->execContinuation =
+                                &GPUStaticInst::execAtomicAcq;
+
+                            gpuDynInst->useContinuation = true;
+                        } else {
+                            // the request will be finished when the RMW completes
+                            gpuDynInst->useContinuation = false;
+                        }
+                        // translation is performed in sendRequest()
+                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
+                                                               pkt);
+                    }
+                }
+
+                ++d;
+                ++e;
+                ++f;
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+        // execAtomicACq will always be called through a continuation.
+        // see comment for execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after performing the RMW, check to see if this instruction
+            // has acquire semantics, and if so, issue an acquire
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                     && gpuDynInst->memoryOrder ==
+                     Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the request will be finished when
+                    // the acquire completes
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+    };
+
+    // Build the final instruction for an atomic whose address-operand type
+    // has already been resolved.  Atomic LD/ST are routed to the ordinary
+    // load/store decoders; everything else becomes an AtomicInst, with or
+    // without a result depending on the opcode.
+    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
+    GPUStaticInst*
+    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
+            return decodeLd<DataType>(ib, obj);
+        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
+            // NOTE(review): every width passes S8 as the memory data type,
+            // varying only the source type -- confirm this is intentional
+            // and not a copy-paste of the B8 case.
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+                return decodeSt<S8,S8>(ib, obj);
+              case Brig::BRIG_TYPE_B16:
+                return decodeSt<S8,S16>(ib, obj);
+              case Brig::BRIG_TYPE_B32:
+                return decodeSt<S8,S32>(ib, obj);
+              case Brig::BRIG_TYPE_B64:
+                return decodeSt<S8,S64>(ib, obj);
+              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
+            }
+        } else {
+            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, false>(ib, obj, "atomicnoret");
+            else
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, true>(ib, obj, "atomic");
+        }
+    }
+
+    // Resolve the address-operand flavor (immediate address vs. single or
+    // double register) for an atomic and forward to constructAtomic.
+    template<typename DataType, int NumSrcOperands>
+    GPUStaticInst*
+    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // returning atomics carry a destination at operand 0, so their
+        // address operand shifts to index 1
+        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
+            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
+
+        unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex);
+
+        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return constructAtomic<DataType, NoRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (tmp.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                  return constructAtomic<DataType, SRegAddrOperand,
+                                         NumSrcOperands>(ib, obj);
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return constructAtomic<DataType, DRegAddrOperand,
+                                       NumSrcOperands>(ib, obj);
+              default:
+                // report the field the switch actually examined (regKind),
+                // not the unrelated data-type field
+                fatal("Bad atomic register operand type %d\n", tmp.regKind);
+            }
+        } else {
+            fatal("Bad atomic register operand kind %d\n", tmp.kind);
+        }
+    }
+
+
+    // Decode a result-returning HSAIL atomic.  CAS carries two source
+    // operands (compare and swap values); every other atomic op carries one.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        return (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) ?
+            decodeAtomicHelper<DataType, 2>(ib, obj) :
+            decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+
+    // Decode a no-return HSAIL atomic; like decodeAtomic, only CAS needs
+    // a second source operand.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        return (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) ?
+            decodeAtomicHelper<DataType, 2>(ib, obj) :
+            decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_MEM_HH__
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/generic_types.hh"
+#include "gpu-compute/hsail_code.hh"
+
+// defined in code.cc, but not worth sucking in all of code.h for this
+// at this point
+extern const char *segmentNames[];
+
+namespace HsailISA
+{
+    // Disassemble as "lda_<type> dest,addr".
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        std::string destStr = this->dest.disassemble();
+        std::string addrStr = this->addr.disassemble();
+
+        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
+                                     DestDataType::label, destStr, addrStr);
+    }
+
+    // lda simply materializes the effective address of each active lane
+    // into the destination register; no memory request is issued.
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        typedef typename DestDataType::CType CType M5_VAR_USED;
+        const VectorMask &pred = wf->get_pred();
+
+        uint64_t laneAddr[VSZ];
+        this->addr.calcVector(wf, laneAddr);
+
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (pred[ln]) {
+                this->dest.set(wf, ln, laneAddr[ln]);
+            }
+        }
+    }
+
+    // Disassemble a load:
+    //   scalar: "ld_<seg>_<type> dest,addr"
+    //   vector: "ld_<seg>_<type> (d0,d1[,d2,d3]), addr"
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        if (num_dest_operands == 1) {
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label,
+                                         this->dest.disassemble(),
+                                         this->addr.disassemble());
+        } else if (num_dest_operands == 2 || num_dest_operands == 4) {
+            // join the vector destinations with commas, then parenthesize
+            std::string destList = this->dest_vect[0].disassemble();
+            for (int i = 1; i < num_dest_operands; ++i) {
+                destList += "," + this->dest_vect[i].disassemble();
+            }
+
+            this->disassembly = csprintf("%s_%s_%s (%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label, destList,
+                                         this->addr.disassemble());
+        } else {
+            fatal("Bad ld register dest operand, num vector operands: %d \n",
+                  num_dest_operands);
+        }
+    }
+
+    // Map a work-item-relative private-segment address to a flat address.
+    //
+    // The compiler does not emit enough symbol information to locate the
+    // object being accessed, so private space is laid out by interleaving
+    // the work-items of a wavefront at an 8-byte granularity.  This
+    // coalesces less well than the spill-space scheme, but spill-style
+    // addressing cannot be used here because the same private address may
+    // be touched by differently sized loads/stores.
+    //
+    // NOTE: assumes the widest private access is 8 bytes; a larger access
+    // would require a larger interleave stride.
+    static Addr
+    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
+    {
+        const Addr chunk = addr / 8;
+        const Addr offsetInChunk = addr % 8;
+
+        Addr flatAddr =
+            chunk * 8 * VSZ + lane * 8 + offsetInChunk + w->privBase;
+
+        assert(flatAddr < w->privBase + (w->privSizePerItem * VSZ));
+
+        return flatAddr;
+    }
+
+    // Execute a load: kernarg and arg segments are serviced functionally
+    // in place; every other segment builds a GPUDynInst memory request and
+    // pushes it into the appropriate (global or local) memory pipeline.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename MemDataType::CType MemCType;
+        const VectorMask &mask = w->get_pred();
+
+        // Kernarg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
+            MemCType val;
+
+            // I assume no vector ld for kernargs
+            assert(num_dest_operands == 1);
+
+            // assuming for the moment that we'll never do register
+            // offsets into kernarg space... just to make life simpler
+            uint64_t address = this->addr.calcUniform();
+
+            val = *(MemCType*)&w->kernelArgs[address];
+
+            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    MemCType val = w->readCallArgMem<MemCType>(lane, address);
+
+                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
+                            (unsigned long long)val);
+
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = MemDataType::memType;
+        m->v_type = DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        // record which VGPR(s) receive the data when the reply returns
+        if (num_dest_operands == 1) {
+            m->dst_reg = this->dest.regIndex();
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_dest_operands;
+            for (int i = 0; i < num_dest_operands; ++i) {
+                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
+            }
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            // addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (m->addr[lane] < w->privSizePerItem) {
+                    if (mask[lane]) {
+                        // what is the size of the object we are accessing?
+                        // find base for for this wavefront
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_dest_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    //  note: this calculation will NOT WORK if the compiler
+                    //  ever generates loads/stores to the same address with
+                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(MemCType) + w->spillBase;
+
+                        w->last_addr[lane] = m->addr[lane];
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_READONLY:
+            m->s_type = SEG_READONLY;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
+                    m->addr[lane] += w->roBase;
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] +
+                            lane * sizeof(MemCType) + w->privBase;
+                    }
+                }
+            }
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            // format string fixed: was "%llxe" (stray trailing 'e')
+            fatal("Load to unsupported segment %d %llx\n", this->segment,
+                  m->addr[0]);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Execute a store: arg-segment stores are serviced functionally in
+    // place; every other segment gathers the source data into d_data,
+    // builds a GPUDynInst request and pushes it into the appropriate
+    // (global or local) memory pipeline.
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename OperationType::CType CType;
+
+        const VectorMask &mask = w->get_pred();
+
+        // arg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    CType data = this->src.template get<CType>(w, lane);
+                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
+                    w->writeCallArgMem<CType>(lane, address, data);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        m->exec_mask = w->execMask();
+
+        this->addr.calcVector(w, m->addr);
+
+        // stage the store data into d_data, one VSZ-wide slab per source
+        // operand
+        if (num_src_operands == 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    ((CType*)m->d_data)[lane] =
+                        this->src.template get<CType>(w, lane);
+                }
+            }
+        } else {
+            for (int k= 0; k < num_src_operands; ++k) {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        ((CType*)m->d_data)[k * VSZ + lane] =
+                            this->src_vect[k].template get<CType>(w, lane);
+                    }
+                }
+            }
+        }
+
+        m->m_op = Enums::MO_ST;
+        m->m_type = OperationType::memType;
+        m->v_type = OperationType::vgprType;
+
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+
+        if (num_src_operands == 1) {
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_src_operands;
+        }
+
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            // addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    if (m->addr[lane] < w->privSizePerItem) {
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(CType)-1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_src_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                // spill space interleaves lanes at spillWidth granularity
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(CType) + w->spillBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+                        m->addr[lane] = m->addr[lane] + lane *
+                            sizeof(CType)+w->privBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Store to unsupported segment %d\n", this->segment);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Disassemble a store:
+    //   scalar: "st_<seg>_<type> src,addr"
+    //   vector: "st_<seg>_<type> (s0,s1[,s2,s3]), addr"
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::generateDisassembly()
+    {
+        switch (num_src_operands) {
+          case 1:
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src.disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 2:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 4:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->src_vect[2].disassemble(),
+                                         this->src_vect[3].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          default:
+            // message fixed: previously said "ld" (copy-paste from the
+            // load disassembler) even though this is the store path
+            fatal("Bad st register src operand, num vector operands: "
+                  "%d \n", num_src_operands);
+            break;
+        }
+    }
+
+    // Execute an atomic: stage the source operand(s) into a_data/x_data,
+    // fill in the GPUDynInst bookkeeping, and push the request into the
+    // global or local memory pipeline.  An atomic counts as both a read
+    // and a write for the outstanding-request accounting.
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+               HasDst>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        typedef typename DataType::CType CType;
+
+        Wavefront *w = gpuDynInst->wavefront();
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        // first source operand (the atomic's data value)
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((CType *)m->a_data)[lane] =
+                this->src[0].template get<CType>(w, lane);
+        }
+
+        // load second source operand for CAS
+        if (NumSrcOperands > 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                ((CType*)m->x_data)[lane] =
+                    this->src[1].template get<CType>(w, lane);
+            }
+        }
+
+        assert(NumSrcOperands <= 2);
+
+        m->m_op = this->opType;
+        m->m_type = DataType::memType;
+        m->v_type = DataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        if (HasDst) {
+            m->dst_reg = this->dest.regIndex();
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->latency.set(w->computeUnit->shader->ticks(64));
+            m->pipeId = GLBMEM_PIPE;
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Atomic op to unsupported segment %d\n",
+                  this->segment);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+ const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
+
+    // Disassemble as "atomic[noret]_<op>_<seg>_<type> [dest,]addr,src...".
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+               HasDst>::generateDisassembly()
+    {
+        std::string text;
+
+        if (HasDst) {
+            text = csprintf("%s_%s_%s_%s %s,%s", this->opcode,
+                            atomicOpToString(this->atomicOperation),
+                            segmentNames[this->segment],
+                            DataType::label, this->dest.disassemble(),
+                            this->addr.disassemble());
+        } else {
+            text = csprintf("%s_%s_%s_%s %s", this->opcode,
+                            atomicOpToString(this->atomicOperation),
+                            segmentNames[this->segment],
+                            DataType::label, this->addr.disassemble());
+        }
+
+        for (int i = 0; i < NumSrcOperands; ++i) {
+            text += "," + this->src[i].disassemble();
+        }
+
+        this->disassembly = text;
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+ // Pseudo (or magic) instructions are overloaded on the hsail call
+ // instruction, because of its flexible parameter signature.
+
+ // To add a new magic instruction:
+ // 1. Add an entry to the enum.
+ // 2. Implement it in the switch statement below (Call::exec).
+ // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+ // so its easy to call from an OpenCL kernel.
+
+ // This enum should be identical to the enum in
+ // hsa/hsail-gpu-compute/util/magicinst.h
+    // Opcode values dispatched on in Call::execPseudoInst below; the
+    // numeric values must stay in sync with the companion enum in
+    // hsa/hsail-gpu-compute/util/magicinst.h.
+    enum
+    {
+        // debug printing of wavefront / per-lane register values
+        MAGIC_PRINT_WF_32 = 0,
+        MAGIC_PRINT_WF_64,
+        MAGIC_PRINT_LANE,
+        MAGIC_PRINT_LANE_64,
+        MAGIC_PRINT_WF_FLOAT,
+        // debugger breakpoint (raises SIGTRAP)
+        MAGIC_SIM_BREAK,
+        // wavefront-wide arithmetic helpers
+        MAGIC_PREF_SUM,
+        MAGIC_REDUCTION,
+        MAGIC_MASKLANE_LOWER,
+        MAGIC_MASKLANE_UPPER,
+        // wavefront barrier join/wait
+        MAGIC_JOIN_WF_BAR,
+        MAGIC_WAIT_WF_BAR,
+        MAGIC_PANIC,
+        // memory-system magic ops
+        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
+        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
+        MAGIC_LOAD_GLOBAL_U32_REG,
+        MAGIC_XACT_CAS_LD,
+        // most-significant-thread helpers
+        MAGIC_MOST_SIG_THD,
+        MAGIC_MOST_SIG_BROADCAST,
+        // id-filtered variants of the wavefront printers
+        MAGIC_PRINT_WFID_32,
+        MAGIC_PRINT_WFID_64
+    };
+
+    // Decode and dispatch a magic (pseudo) instruction overloaded on the
+    // HSAIL call instruction.  Operand 0 of src1 carries the magic opcode;
+    // every active lane must supply the same opcode, otherwise we fatal().
+    void
+    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        const VectorMask &mask = w->get_pred();
+
+        int op = 0;
+        bool got_op = false;
+
+        // Extract the opcode from the first active lane and verify all
+        // other active lanes agree.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val0 = src1.get<int>(w, lane, 0);
+                if (got_op) {
+                    if (src_val0 != op) {
+                        fatal("Multiple magic instructions per PC not "
+                              "supported\n");
+                    }
+                } else {
+                    op = src_val0;
+                    got_op = true;
+                }
+            }
+        }
+
+        // Dispatch to the handler for this opcode; see the enum above.
+        switch(op) {
+          case MAGIC_PRINT_WF_32:
+            MagicPrintWF32(w);
+            break;
+          case MAGIC_PRINT_WF_64:
+            MagicPrintWF64(w);
+            break;
+          case MAGIC_PRINT_LANE:
+            MagicPrintLane(w);
+            break;
+          case MAGIC_PRINT_LANE_64:
+            MagicPrintLane64(w);
+            break;
+          case MAGIC_PRINT_WF_FLOAT:
+            MagicPrintWFFloat(w);
+            break;
+          case MAGIC_SIM_BREAK:
+            MagicSimBreak(w);
+            break;
+          case MAGIC_PREF_SUM:
+            MagicPrefixSum(w);
+            break;
+          case MAGIC_REDUCTION:
+            MagicReduction(w);
+            break;
+          case MAGIC_MASKLANE_LOWER:
+            MagicMaskLower(w);
+            break;
+          case MAGIC_MASKLANE_UPPER:
+            MagicMaskUpper(w);
+            break;
+          case MAGIC_JOIN_WF_BAR:
+            MagicJoinWFBar(w);
+            break;
+          case MAGIC_WAIT_WF_BAR:
+            MagicWaitWFBar(w);
+            break;
+          case MAGIC_PANIC:
+            MagicPanic(w);
+            break;
+
+          // atomic instructions (these issue real memory requests and
+          // therefore need the dynamic instruction object)
+          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_LOAD_GLOBAL_U32_REG:
+            MagicLoadGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_XACT_CAS_LD:
+            MagicXactCasLd(w);
+            break;
+
+          case MAGIC_MOST_SIG_THD:
+            MagicMostSigThread(w);
+            break;
+
+          case MAGIC_MOST_SIG_BROADCAST:
+            MagicMostSigBroadcast(w);
+            break;
+
+          case MAGIC_PRINT_WFID_32:
+            MagicPrintWF32ID(w);
+            break;
+
+          case MAGIC_PRINT_WFID_64:
+            MagicPrintWFID64(w);
+            break;
+
+          default: fatal("unrecognized magic instruction: %d\n", op);
+        }
+    }
+
+    // Print one 32-bit value (operand 1) per active lane, one debug line
+    // per lane.  Operand 2 selects hex (non-zero) vs decimal output.
+    // Compiled out entirely when tracing is disabled.
+    void
+    Call::MagicPrintLane(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+#endif
+    }
+
+    // 64-bit variant of MagicPrintLane: prints operand 1 as an int64_t
+    // for each active lane.  Operand 2 selects hex vs decimal.
+    // NOTE(review): the format strings reuse the 32-bit "%x"/"%d"
+    // specifiers for a 64-bit value; gem5's type-aware cprintf still
+    // prints the full value, but the WF64 printer uses "%016x" — confirm
+    // whether the narrower width here is intentional.
+    void
+    Call::MagicPrintLane64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+#endif
+    }
+
+    // Print one 32-bit value (operand 1) per lane for the whole
+    // wavefront, eight lanes per output row.  Operand 2 selects hex
+    // (non-zero) vs decimal; inactive lanes print as "xxxxxxxx".
+    void
+    Call::MagicPrintWF32(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            // Newline at the end of each 8-lane row, space otherwise.
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Like MagicPrintWF32, but only emits output when this wavefront's
+    // dynamic id matches the filter id supplied in operand 3 (taken from
+    // the last active lane).  If no lane is active, src_val3 stays -1 and
+    // nothing is printed.
+    void
+    Call::MagicPrintWF32ID(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // Only print when the filter id matches this wavefront.
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+#endif
+    }
+
+    // Print one 64-bit value (operand 1) per lane for the whole
+    // wavefront, four lanes per output row.  Operand 2 selects hex
+    // (non-zero) vs decimal; inactive lanes print as 16 'x' characters.
+    void
+    Call::MagicPrintWF64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 4 lanes.
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            // Newline at the end of each 4-lane row, space otherwise.
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Like MagicPrintWF64, but only emits output when this wavefront's
+    // dynamic id matches the filter id supplied in operand 3 (taken from
+    // the last active lane).  If no lane is active, src_val3 stays -1 and
+    // nothing is printed.
+    void
+    Call::MagicPrintWFID64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 4 lanes.
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // Only print when the filter id matches this wavefront.
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+#endif
+    }
+
+    // Print one float value (operand 1) per lane for the whole
+    // wavefront, eight lanes per output row; inactive lanes print as
+    // "xxxxxxxx".
+    void
+    Call::MagicPrintWFFloat(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                float src_val1 = src1.get<float>(w, lane, 1);
+                res_str += csprintf("%08f", src_val1);
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Simulator breakpoint: dumps this wavefront's identity and exec
+    // mask, then raises SIGTRAP so an attached GDB stops here.
+    // When done with the break, type "signal 0" in gdb to continue.
+    void
+    Call::MagicSimBreak(Wavefront *w)
+    {
+        std::string res_str;
+        // print out state for this wavefront and then break
+        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+                           w->wfSlotId);
+
+        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
+        res_str += csprintf("  Phase ID: %i\n", w->simdId);
+        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
+        res_str += csprintf("  Exec mask: ");
+
+        // Render the execution mask MSB-first, a space after every byte.
+        for (int i = VSZ - 1; i >= 0; --i) {
+            if (w->execMask(i))
+                res_str += "1";
+            else
+                res_str += "0";
+
+            if ((i & 7) == 7)
+                res_str += " ";
+        }
+
+        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+        res_str += "\nHelpful debugging hints:\n";
+        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+        fflush(stdout);
+
+        raise(SIGTRAP);
+    }
+
+    // Exclusive prefix sum across the wavefront: each active lane's dest
+    // receives the sum of operand 1 over all lower-numbered active lanes
+    // (the first active lane gets 0).  Inactive lanes are skipped and
+    // contribute nothing.
+    void
+    Call::MagicPrefixSum(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                // Write the running total before adding this lane's value,
+                // making the scan exclusive.
+                dest.set<int>(w, lane, res);
+                res += src_val1;
+            }
+        }
+    }
+
+    // Wavefront-wide sum reduction: adds operand 1 across all active
+    // lanes, then writes the total back to every active lane's dest.
+    void
+    Call::MagicReduction(Wavefront *w)
+    {
+        // reduction magic instruction
+        // The reduction instruction takes up to 64 inputs (one from
+        // each thread in a WF) and sums them. It returns the sum to
+        // each thread in the WF.
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        // Pass 1: accumulate the sum over active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                res += src_val1;
+            }
+        }
+
+        // Pass 2: broadcast the result to every active lane.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Build a ballot bitmask over the LOWER half of the wavefront
+    // (lanes 0..VSZ/2-1): bit 'lane' is set when that lane is active and
+    // its operand 1 is non-zero.  The mask is broadcast to every active
+    // lane's dest.
+    void
+    Call::MagicMaskLower(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+
+                if (src_val1) {
+                    // Only lower-half lanes contribute a bit.
+                    if (lane < (VSZ/2)) {
+                        res = res | ((uint32_t)(1) << lane);
+                    }
+                }
+            }
+        }
+
+        // Broadcast the ballot result to all active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Build a ballot bitmask over the UPPER half of the wavefront
+    // (lanes VSZ/2..VSZ-1): bit 'lane - VSZ/2' is set when that lane is
+    // active and its operand 1 is non-zero.  The mask is broadcast to
+    // every active lane's dest.
+    void
+    Call::MagicMaskUpper(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+
+                if (src_val1) {
+                    // Only upper-half lanes contribute, rebased to bit 0.
+                    if (lane >= (VSZ/2)) {
+                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+                    }
+                }
+            }
+        }
+
+        // Broadcast the ballot result to all active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Join the wavefront barrier: bump each active lane's barrier count
+    // and raise the wavefront's max_bar_cnt to the largest per-lane
+    // count seen.
+    void
+    Call::MagicJoinWFBar(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]++;
+
+                if (w->bar_cnt[lane] > max_cnt) {
+                    max_cnt = w->bar_cnt[lane];
+                }
+            }
+        }
+
+        // Track the high-water mark across joins.
+        if (max_cnt > w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+    }
+
+    // Wait on the wavefront barrier: decrement each active lane's
+    // barrier count, lower max_bar_cnt to the largest remaining count,
+    // and flush the instruction buffer / pending fetch so execution
+    // restarts cleanly after the barrier.
+    void
+    Call::MagicWaitWFBar(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]--;
+            }
+
+            // NOTE(review): unlike MagicJoinWFBar, this max scan runs for
+            // every lane, including inactive ones — confirm that counting
+            // inactive lanes' bar_cnt here is intentional.
+            if (w->bar_cnt[lane] > max_cnt) {
+                max_cnt = w->bar_cnt[lane];
+            }
+        }
+
+        if (max_cnt < w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+
+        // Discard everything after the current instruction and drop any
+        // in-flight fetch so stale instructions are not executed.
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+        if (w->pendingFetch)
+            w->dropFetch = true;
+    }
+
+    // Abort the simulation with the assertion id (operand 1) of the
+    // first active lane.  panic() does not return, so at most one lane
+    // reports.  (gem5's type-aware panic formatting prints the integer
+    // lane despite the "%s" specifier.)
+    void
+    Call::MagicPanic(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
+                      src_val1, lane);
+            }
+        }
+    }
+
+    // Assemble each lane's 64-bit address from two 32-bit halves:
+    // operand 1 supplies the high word, operand 2 the low word.
+    // NOTE(review): the halves are read as signed ints, so a negative
+    // low word sign-extends when cast to Addr and its upper bits OR over
+    // the high word — assumes the low word is effectively unsigned and
+    // the compiler emits it that way; TODO confirm.
+    void
+    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+    {
+        // the address is in src1 | src2
+        for (int lane = 0; lane < VSZ; ++lane) {
+            int src_val1 = src1.get<int>(w, lane, 1);
+            int src_val2 = src1.get<int>(w, lane, 2);
+            Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
+
+            m->addr[lane] = addr;
+        }
+
+    }
+
+    // Issue a no-return atomic-add (u32) to GLOBAL memory.  The address
+    // comes from operands 1/2 (see calcAddr); the per-lane addend comes
+    // from operand 3.  The request is pushed into the global memory
+    // pipeline and the wavefront's outstanding-request counters updated.
+    void
+    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+
+        calcAddr(w, m);
+
+        // Operand 3 holds each lane's addend.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // An atomic counts as both a read and a write in flight.
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Issue a no-return atomic-add (u32) for the GROUP-memory magic op.
+    // NOTE(review): despite the name, the request is tagged SEG_GLOBAL
+    // and pushed into the global memory pipeline, and the addend is read
+    // from operand 1 — the same operand calcAddr uses as the address
+    // high word, whereas the Global variant reads operand 3.  Confirm
+    // both choices are intentional.
+    void
+    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+        calcAddr(w, m);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // An atomic counts as both a read and a write in flight.
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Issue a u32 load from GLOBAL memory at the per-lane address built
+    // by calcAddr, and push it into the global memory pipeline.  The
+    // destination register is not yet wired up (see FIXME below).
+    void
+    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+        // calculate the address
+        calcAddr(w, m);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = U32::memType;  //MemDataType::memType;
+        m->v_type = U32::vgprType; //DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        // FIXME
+        //m->dst_reg = this->dest.regIndex();
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(1));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // A plain load only counts against the read-request bookkeeping.
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Register this wavefront in the compute unit's transactional
+    // CAS-load map under the key given by operand 1 of the first active
+    // lane, creating (and clearing) the wave queue on first use.
+    void
+    Call::MagicXactCasLd(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int src_val1 = 0;
+
+        // Take the key from the first active lane only.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                src_val1 = src1.get<int>(w, lane, 1);
+                break;
+            }
+        }
+
+        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
+            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
+            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
+        }
+
+        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
+            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+    }
+
+    // Write 1 to the dest of the highest-numbered active lane and 0 to
+    // every other active lane, by scanning lanes from high to low and
+    // clearing the flag after the first hit.
+    void
+    Call::MagicMostSigThread(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        unsigned mst = true;
+
+        for (int lane = VSZ - 1; lane >= 0; --lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, mst);
+                mst = false;
+            }
+        }
+    }
+
+    // Broadcast operand 1 of the highest-numbered active lane to the
+    // dest of every active lane.
+    void
+    Call::MagicMostSigBroadcast(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+        bool got_res = false;
+
+        for (int lane = VSZ - 1; lane >= 0; --lane) {
+            if (mask[lane]) {
+                // Capture the value from the first (highest) active lane.
+                if (!got_res) {
+                    res = src1.get<int>(w, lane, 1);
+                    got_res = true;
+                }
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+} // namespace HsailISA