summaryrefslogtreecommitdiff
path: root/src/arch/hsail/insts
diff options
context:
space:
mode:
Diffstat (limited to 'src/arch/hsail/insts')
-rw-r--r--src/arch/hsail/insts/branch.cc86
-rw-r--r--src/arch/hsail/insts/branch.hh442
-rw-r--r--src/arch/hsail/insts/decl.hh1106
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.cc64
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.hh65
-rw-r--r--src/arch/hsail/insts/main.cc208
-rw-r--r--src/arch/hsail/insts/mem.cc139
-rw-r--r--src/arch/hsail/insts/mem.hh1629
-rw-r--r--src/arch/hsail/insts/mem_impl.hh660
-rw-r--r--src/arch/hsail/insts/pseudo_inst.cc787
10 files changed, 5186 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc
new file mode 100644
index 000000000..d65279cc8
--- /dev/null
+++ b/src/arch/hsail/insts/branch.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/branch.hh"
+
+#include "gpu-compute/hsail_code.hh"
+
+namespace HsailISA
+{
+ GPUStaticInst*
+ decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // register operand.
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new BrnIndirectInst(ib, obj);
+ } else {
+ return new BrnDirectInst(ib, obj);
+ }
+ }
+
+ GPUStaticInst*
+ decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // second register operand (after the condition).
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new CbrIndirectInst(ib, obj);
+ } else {
+ return new CbrDirectInst(ib, obj);
+ }
+ }
+
    GPUStaticInst*
    decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // Choose between the direct (label target) and indirect
        // (register target) forms of br by inspecting an operand.
        //
        // NOTE(review): this comment block was copied from decodeCbr --
        // br is unconditional, so there is no condition operand. The
        // probe also reads operand 1 while BrInstBase reads its target
        // from operand 0; presumably operand 0 holds the width
        // immediate here -- confirm against the BRIG operand layout
        // for br.
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        const Brig::BrigOperand *reg = obj->getOperand(op_offs);

        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            return new BrIndirectInst(ib, obj);
        } else {
            return new BrDirectInst(ib, obj);
        }
    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
new file mode 100644
index 000000000..54ad9a042
--- /dev/null
+++ b/src/arch/hsail/insts/branch.hh
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
+#define __ARCH_HSAIL_INSTS_BRANCH_HH__
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+
+ // The main difference between a direct branch and an indirect branch
+ // is whether the target is a register or a label, so we can share a
+ // lot of code if we template the base implementation on that type.
+ template<typename TargetType>
+ class BrnInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ TargetType target;
+
+ BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "brn")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr*)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+
+ bool isSrcOperand(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return true;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+
+ int getNumOperands() {
+ return 1;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename TargetType>
+ void
+ BrnInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
    // Unconditional branch: every active lane takes it, so the
    // execution mask is left untouched.
    template<typename TargetType>
    void
    BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            // Branching straight to the reconvergence point: resume the
            // saved stack entry instead of jumping.
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+ class BrnDirectInst : public BrnInstBase<LabelOperand>
+ {
+ public:
+ BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrnIndirectInst : public BrnInstBase<SRegOperand>
+ {
+ public:
+ BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class CbrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ CRegOperand cond;
+ TargetType target;
+
+ CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "cbr")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr *)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ cond.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ // Assumption: Target is operand 0, Condition Register is operand 1
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isVectorRegister();
+ else
+ return false;
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isCondRegister();
+ else
+ return true;
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return target.isScalarRegister();
+ else
+ return false;
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == 0)
+ return true;
+ return false;
+ }
+ // both Condition Register and Target are source operands
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.opSize();
+ else
+ return 1;
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.regIndex();
+ else
+ return -1;
+ }
+
+ // Operands = Target, Condition Register
+ int getNumOperands() {
+ return 2;
+ }
+ };
+
+ template<typename TargetType>
+ void
+ CbrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
+ cond.disassemble(), target.disassemble());
+ }
+
    // Conditional branch with divergence support: splits the current
    // execution mask into taken/not-taken subsets and pushes them on
    // the wavefront's reconvergence stack so lanes re-join at the
    // immediate post-dominator.
    template<typename TargetType>
    void
    CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        const uint32_t curr_pc = w->pc();
        const uint32_t curr_rpc = w->rpc();
        const VectorMask curr_mask = w->execMask();

        /**
         * TODO: can we move this pop outside the instruction, and
         * into the wavefront?
         */
        w->popFromReconvergenceStack();

        // immediate post-dominator instruction, where the diverging
        // lanes will reconverge
        const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
        if (curr_pc != rpc) {
            w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
        }

        // taken branch: lanes whose condition is true AND that were
        // active to begin with
        const uint32_t true_pc = getTargetPc();
        VectorMask true_mask;
        for (unsigned int lane = 0; lane < VSZ; ++lane) {
            true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
        }

        // not taken branch: falls through to the next instruction.
        // Only pushed if some active lane did not take the branch
        // (true_mask smaller than curr_mask) and the fall-through is
        // not already the reconvergence point.
        const uint32_t false_pc = curr_pc + 1;
        assert(true_pc != false_pc);
        if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
            VectorMask false_mask = curr_mask & ~true_mask;
            w->pushToReconvergenceStack(false_pc, rpc, false_mask);
        }

        // Push the taken side last so it executes first.
        if (true_pc != rpc && true_mask.count()) {
            w->pushToReconvergenceStack(true_pc, rpc, true_mask);
        }
        assert(w->pc() != curr_pc);
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+
+ class CbrDirectInst : public CbrInstBase<LabelOperand>
+ {
+ public:
+ CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ // the source operand of a conditional branch is a Condition
+ // Register which is not stored in the VRF
+ // so we do not count it as a source-register operand
+ // even though, formally, it is one.
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class CbrIndirectInst : public CbrInstBase<SRegOperand>
+ {
+ public:
+ CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ // one source operand of the conditional indirect branch is a Condition
+ // register which is not stored in the VRF so we do not count it
+ // as a source-register operand even though, formally, it is one.
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class BrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ ImmOperand<uint32_t> width;
+ TargetType target;
+
+ BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "br")
+ {
+ o_type = Enums::OT_BRANCH;
+ width.init(((Brig::BrigInstBr *)ib)->width, obj);
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return true;
+ }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+ int getNumOperands() { return 1; }
+ };
+
+ template<typename TargetType>
+ void
+ BrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width.bits != 1) {
+ widthClause = csprintf("_width(%d)", width.bits);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
    // Unconditional branch: every active lane takes it, so the
    // execution mask is left untouched.
    template<typename TargetType>
    void
    BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            // Branching straight to the reconvergence point: resume the
            // saved stack entry instead of jumping.
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        // Anything already fetched past this branch is stale.
        w->discardFetch();
    }
+
+ class BrDirectInst : public BrInstBase<LabelOperand>
+ {
+ public:
+ BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrIndirectInst : public BrInstBase<SRegOperand>
+ {
+ public:
+ BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
new file mode 100644
index 000000000..e2da501b9
--- /dev/null
+++ b/src/arch/hsail/insts/decl.hh
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_DECL_HH__
+#define __ARCH_HSAIL_INSTS_DECL_HH__
+
+#include <cmath>
+
+#include "arch/hsail/generic_types.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "debug/HSAIL.hh"
+#include "enums/OpType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+namespace HsailISA
+{
+ template<typename _DestOperand, typename _SrcOperand>
+ class HsailOperandType
+ {
+ public:
+ typedef _DestOperand DestOperand;
+ typedef _SrcOperand SrcOperand;
+ };
+
+ typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType;
+ typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType;
+ typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType;
+
+    // The IsBits parameter serves only to disambiguate the B* types from
+    // the U* types, which otherwise would be identical (and
+    // indistinguishable).
+ template<typename _OperandType, typename _CType, Enums::MemType _memType,
+ vgpr_type _vgprType, int IsBits=0>
+ class HsailDataType
+ {
+ public:
+ typedef _OperandType OperandType;
+ typedef _CType CType;
+ static const Enums::MemType memType = _memType;
+ static const vgpr_type vgprType = _vgprType;
+ static const char *label;
+ };
+
+ typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1;
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8;
+
+ typedef HsailDataType<SRegOperandType, uint16_t,
+ Enums::M_U16, VT_32, 1> B16;
+
+ typedef HsailDataType<SRegOperandType, uint32_t,
+ Enums::M_U32, VT_32, 1> B32;
+
+ typedef HsailDataType<DRegOperandType, uint64_t,
+ Enums::M_U64, VT_64, 1> B64;
+
+ typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8;
+ typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16;
+ typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32;
+ typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64;
+
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8;
+ typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16;
+ typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32;
+ typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64;
+
+ typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32;
+ typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64;
+
+ template<typename DestOperandType, typename SrcOperandType,
+ int NumSrcOperands>
+ class CommonInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename SrcOperandType::SrcOperand src[NumSrcOperands];
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s%s %s", opcode, opcode_suffix(),
+ dest.disassemble());
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ disassembly += ",";
+ disassembly += src[i].disassemble();
+ }
+ }
+
+ virtual std::string opcode_suffix() = 0;
+
+ public:
+ CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+
+ dest.init(op_offs, obj);
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ op_offs = obj->getOperandPtr(ib->operands, i + 1);
+ src[i].init(op_offs, obj);
+ }
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return true;
+ return false;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= NumSrcOperands)
+ return true;
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].regIndex();
+ else
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() {
+ int operands = 0;
+ for (int i = 0; i < NumSrcOperands; i++) {
+ if (src[i].isVectorRegister() == true) {
+ operands++;
+ }
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return NumSrcOperands + 1; }
+ };
+
+ template<typename DataType, int NumSrcOperands>
+ class ArithInst : public CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>
+ {
+ public:
+ std::string opcode_suffix() { return csprintf("_%s", DataType::label); }
+
+ ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType, typename Src2OperandType>
+ class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+ typename Src2OperandType::SrcOperand src2;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble(),
+ src2.disassemble());
+ }
+
+ public:
+ ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 3);
+ src2.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else if (operandIndex == 2)
+ return src2.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else if (operandIndex == 2)
+ return src2.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else if (operandIndex == 2)
+ return src2.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 3)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 3)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else if (operandIndex == 2)
+ return src2.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else if (operandIndex == 2)
+ return src2.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src2.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 4; }
+ };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType, typename Src2DataType>
+ class ThreeNonUniformSourceInst :
+ public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+ typedef typename Src2DataType::CType Src2CType;
+
+ ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class CmovInst : public ThreeNonUniformSourceInst<DataType, B1,
+ DataType, DataType>
+ {
+ public:
+ CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, B1, DataType,
+ DataType>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType,
+ DataType, U32,
+ U32>
+ {
+ public:
+ ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, DataType, U32,
+ U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType>
+ class TwoNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble());
+ }
+
+
+ public:
+ TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 2)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 2)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ // count how many of the two sources live in vector registers
+ // (immediates and non-vector operands are not counted)
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ // dest contributes one register destination only when it is a
+ // vector register (bool converts to 0/1)
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 3; }
+ };
+
+ // Typed wrapper over TwoNonUniformSourceInstBase: binds the three
+ // DataType parameters to their OperandTypes and exposes the CTypes
+ // used by execute() implementations. Adds no state of its own.
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType>
+ class TwoNonUniformSourceInst :
+ public TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+
+ TwoNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ // helper function for ClassInst
+ // Returns true if src0 belongs to any of the HSAIL floating-point
+ // classes selected by the src1 bit mask (bit 0 sNaN, 1 qNaN,
+ // 2 -inf, 3 -normal, 4 -subnormal, 5 -0, 6 +0, 7 +subnormal,
+ // 8 +normal, 9 +inf).
+ template<typename T>
+ bool
+ fpclassify(T src0, uint32_t src1)
+ {
+ int fpclass = std::fpclassify(src0);
+
+ // NOTE: bits 0 (signaling NaN) and 1 (quiet NaN) are treated
+ // alike here; the NaN kind is not distinguished
+ if ((src1 & 0x3) && (fpclass == FP_NAN)) {
+ return true;
+ }
+
+ // use signbit() to select the sign branch: the previous test
+ // (src0 <= -0.0) also matched +0.0, since +0.0 == -0.0 under
+ // IEEE comparison, so positive zero was classified as negative
+ // zero (bit 5) and bit 6 never matched
+ if (std::signbit(src0)) {
+ if ((src1 & 0x4) && fpclass == FP_INFINITE)
+ return true;
+ if ((src1 & 0x8) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x10) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x20) && fpclass == FP_ZERO)
+ return true;
+ } else {
+ if ((src1 & 0x40) && fpclass == FP_ZERO)
+ return true;
+ if ((src1 & 0x80) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x100) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x200) && fpclass == FP_INFINITE)
+ return true;
+ }
+ return false;
+ }
+
+ // HSAIL "class" instruction: dest is a b1 predicate, src0 is the
+ // value under test and src1 is the u32 class bit mask consumed by
+ // the fpclassify() helper above.
+ template<typename DataType>
+ class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32>
+ {
+ public:
+ ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // Shift instructions: dest and src0 share DataType; the shift
+ // amount (src1) is always u32.
+ template<typename DataType>
+ class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+ {
+ public:
+ ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // helper function for CmpInst
+ // Evaluates an HSAIL compare operation on src0/src1. The ordered
+ // and unordered variants (e.g. EQ vs EQU) map to the same C++
+ // comparison here; NUM/NAN rely on NaN != NaN to detect NaNs.
+ template<typename T>
+ bool
+ compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ case BRIG_COMPARE_EQU:
+ case BRIG_COMPARE_SEQ:
+ case BRIG_COMPARE_SEQU:
+ return (src0 == src1);
+
+ case BRIG_COMPARE_NE:
+ case BRIG_COMPARE_NEU:
+ case BRIG_COMPARE_SNE:
+ case BRIG_COMPARE_SNEU:
+ return (src0 != src1);
+
+ case BRIG_COMPARE_LT:
+ case BRIG_COMPARE_LTU:
+ case BRIG_COMPARE_SLT:
+ case BRIG_COMPARE_SLTU:
+ return (src0 < src1);
+
+ case BRIG_COMPARE_LE:
+ case BRIG_COMPARE_LEU:
+ case BRIG_COMPARE_SLE:
+ case BRIG_COMPARE_SLEU:
+ return (src0 <= src1);
+
+ case BRIG_COMPARE_GT:
+ case BRIG_COMPARE_GTU:
+ case BRIG_COMPARE_SGT:
+ case BRIG_COMPARE_SGTU:
+ return (src0 > src1);
+
+ case BRIG_COMPARE_GE:
+ case BRIG_COMPARE_GEU:
+ case BRIG_COMPARE_SGE:
+ case BRIG_COMPARE_SGEU:
+ return (src0 >= src1);
+
+ // "num": true iff neither operand is NaN (x == x fails for NaN)
+ case BRIG_COMPARE_NUM:
+ case BRIG_COMPARE_SNUM:
+ return (src0 == src0) || (src1 == src1);
+
+ // "nan": true iff at least one operand is NaN
+ case BRIG_COMPARE_NAN:
+ case BRIG_COMPARE_SNAN:
+ return (src0 != src0) || (src1 != src1);
+
+ default:
+ fatal("Bad cmpOp value %d\n", (int)cmpOp);
+ }
+ }
+
+ // HSAIL firstbit: number of leading bits equal to the sign bit
+ // before the first differing bit; -1 if no such bit exists
+ // (i.e. src0 is 0, or -1 for signed types).
+ template<typename T>
+ int32_t
+ firstbit(T src0)
+ {
+ if (!src0)
+ return -1;
+
+ //handle positive and negative numbers
+ T tmp = (src0 < 0) ? (~src0) : (src0);
+
+ // ~(-1) == 0: without this check the scan below would never
+ // terminate for signed src0 == -1
+ if (!tmp)
+ return -1;
+
+ //the starting pos is MSB
+ int pos = 8 * sizeof(T) - 1;
+ int cnt = 0;
+
+ // search the first bit set to 1; test via (tmp >> pos) rather
+ // than (1 << pos): the literal 1 is an int, so shifting it by
+ // up to 63 bits for 64-bit T is undefined behavior
+ while (!((tmp >> pos) & 1)) {
+ ++cnt;
+ --pos;
+ }
+ return cnt;
+ }
+
+ const char* cmpOpToString(Brig::BrigCompareOperation cmpOp);
+
+ // Two-source instruction that additionally decodes the compare
+ // operation (eq/ne/lt/...) from the BrigInstCmp record.
+ template<typename DestOperandType, typename SrcOperandType>
+ class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType,
+ 2>
+ {
+ protected:
+ Brig::BrigCompareOperation cmpOp;
+
+ public:
+ CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj,
+ _opcode)
+ {
+ // the BRIG record must really be a cmp instruction before we
+ // downcast to read the compare field
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP);
+ Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib;
+ cmpOp = (Brig::BrigCompareOperation)i->compare;
+ }
+ };
+
+ // Typed compare instruction; its disassembly suffix encodes the
+ // compare op plus destination and source type labels,
+ // e.g. "_lt_b1_f32".
+ template<typename DestDataType, typename SrcDataType>
+ class CmpInst : public CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>
+ {
+ public:
+ std::string
+ opcode_suffix()
+ {
+ return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp),
+ DestDataType::label, SrcDataType::label);
+ }
+
+ CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ // Conversion instruction (one source): the suffix carries both the
+ // destination and source type labels, e.g. "_f64_f32".
+ template<typename DestDataType, typename SrcDataType>
+ class CvtInst : public CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType, 1>
+ {
+ public:
+ std::string opcode_suffix()
+ {
+ return csprintf("_%s_%s", DestDataType::label, SrcDataType::label);
+ }
+
+ CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType,
+ 1>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ // Base for instructions with no register operands at all (ret,
+ // barrier, memfence). All operand queries return "nothing".
+ class SpecialInstNoSrcNoDest : public HsailGPUStaticInst
+ {
+ public:
+ SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 0; }
+ };
+
+ // Base for instructions with a single destination and no sources;
+ // the lone operand (index 0) is always the destination.
+ template<typename DestOperandType>
+ class SpecialInstNoSrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+
+ void generateDisassembly()
+ {
+ disassembly = csprintf("%s %s", opcode, dest.disassemble());
+ }
+
+ public:
+ SpecialInstNoSrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ // counts as a register destination only if dest is a vector reg
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ // Typed wrapper over SpecialInstNoSrcBase exposing the
+ // destination's CType for execute() implementations.
+ template<typename DestDataType>
+ class SpecialInstNoSrc :
+ public SpecialInstNoSrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ // Base for instructions with one destination and one immediate
+ // source. Note the operand-query methods report on dest only; the
+ // immediate src0 is not a register, so getNumOperands() is 1.
+ template<typename DestOperandType>
+ class SpecialInst1SrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typedef int SrcCType; // used in execute() template
+
+ typename DestOperandType::DestOperand dest;
+ ImmOperand<SrcCType> src0;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(),
+ src0.disassemble());
+ }
+
+ public:
+ SpecialInst1SrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ // Typed wrapper over SpecialInst1SrcBase exposing the
+ // destination's CType for execute() implementations.
+ template<typename DestDataType>
+ class SpecialInst1Src :
+ public SpecialInst1SrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ // HSAIL "ret": terminates the executing work-items; the heavy
+ // lifting (masking off lanes, freeing wavefront resources) lives
+ // in Ret::execute() in main.cc.
+ class Ret : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Ret(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "ret")
+ {
+ o_type = Enums::OT_RET;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ // HSAIL "barrier": decodes the barrier width from the BrigInstBr
+ // record; execute() (main.cc) bumps the barrier count and stalls
+ // the wavefront.
+ class Barrier : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+ uint8_t width;
+
+ Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "barrier")
+ {
+ o_type = Enums::OT_BARRIER;
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_BR);
+ width = (uint8_t)((Brig::BrigInstBr*)ib)->width;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ // HSAIL "memfence": decodes per-segment memory scopes and the
+ // memory order from the BrigInstMemFence record. Group-only fences
+ // are no-ops (LDS is sequentially consistent); fences touching the
+ // global segment are injected into the global memory pipeline.
+ class MemFence : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Brig::BrigMemoryOrder memFenceMemOrder;
+ Brig::BrigMemoryScope memFenceScopeSegGroup;
+ Brig::BrigMemoryScope memFenceScopeSegGlobal;
+ Brig::BrigMemoryScope memFenceScopeSegImage;
+
+ MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "memfence")
+ {
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE);
+
+ memFenceScopeSegGlobal = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope;
+
+ memFenceScopeSegGroup = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope;
+
+ memFenceScopeSegImage = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope;
+
+ memFenceMemOrder = (Brig::BrigMemoryOrder)
+ ((Brig::BrigInstMemFence*)ib)->memoryOrder;
+
+ // set o_type based on scopes; the image segment scope is
+ // decoded above but does not influence o_type
+ if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE &&
+ memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_BOTH_MEMFENCE;
+ } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_GLOBAL_MEMFENCE;
+ } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_SHARED_MEMFENCE;
+ } else {
+ fatal("MemFence constructor: bad scope specifiers\n");
+ }
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wave = gpuDynInst->wavefront();
+ wave->computeUnit->injectGlobalMemFence(gpuDynInst);
+ }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+ // 2 cases:
+ // * memfence to a sequentially consistent memory (e.g., LDS).
+ // These can be handled as no-ops.
+ // * memfence to a relaxed consistency cache (e.g., Hermes, Viper,
+ // etc.). We send a packet, tagged with the memory order and
+ // scope, and let the GPU coalescer handle it.
+
+ if (o_type == Enums::OT_GLOBAL_MEMFENCE ||
+ o_type == Enums::OT_BOTH_MEMFENCE) {
+ gpuDynInst->simdId = w->simdId;
+ gpuDynInst->wfSlotId = w->wfSlotId;
+ gpuDynInst->wfDynId = w->wfDynId;
+ gpuDynInst->kern_id = w->kern_id;
+ gpuDynInst->cu_id = w->computeUnit->cu_id;
+
+ gpuDynInst->memoryOrder =
+ getGenericMemoryOrder(memFenceMemOrder);
+ gpuDynInst->scope =
+ getGenericMemoryScope(memFenceScopeSegGlobal);
+ gpuDynInst->useContinuation = false;
+ GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
+ gmp->getGMReqFIFO().push(gpuDynInst);
+
+ // the fence was not issued through the normal load/store
+ // accounting, so undo the speculative per-type counts and
+ // track it as a single outstanding request
+ w->wr_gm_reqs_in_pipe--;
+ w->rd_gm_reqs_in_pipe--;
+ w->mem_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ } else if (o_type == Enums::OT_SHARED_MEMFENCE) {
+ // no-op
+ } else {
+ fatal("MemFence execute: bad o_type\n");
+ }
+ }
+ };
+
+ // HSAIL "call". Native HSAIL function calls are not implemented;
+ // the opcode is used to dispatch gem5 "magic" pseudo instructions,
+ // identified by a "__gem5_hsail_op" prefix in the callee name.
+ // Operand layout: dest = return-value list, src0 = function ref,
+ // src1 = argument list.
+ class Call : public HsailGPUStaticInst
+ {
+ public:
+ // private helper functions
+ void calcAddr(Wavefront* w, GPUDynInstPtr m);
+
+ void
+ generateDisassembly()
+ {
+ if (dest.disassemble() == "") {
+ disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(),
+ src1.disassemble());
+ } else {
+ disassembly = csprintf("%s %s (%s) (%s)", opcode,
+ src0.disassemble(), dest.disassemble(),
+ src1.disassemble());
+ }
+ }
+
+ // true if the callee name marks this call as a gem5 pseudo op
+ bool
+ isPseudoOp()
+ {
+ std::string func_name = src0.disassemble();
+ if (func_name.find("__gem5_hsail_op") != std::string::npos) {
+ return true;
+ }
+ return false;
+ }
+
+ // member variables
+ ListOperand dest;
+ FunctionRefOperand src0;
+ ListOperand src1;
+ HsailCode *func_ptr;
+
+ // exec function for pseudo instructions mapped on top of call opcode
+ void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ // user-defined pseudo instructions
+ void MagicPrintLane(Wavefront *w);
+ void MagicPrintLane64(Wavefront *w);
+ void MagicPrintWF32(Wavefront *w);
+ void MagicPrintWF64(Wavefront *w);
+ void MagicPrintWFFloat(Wavefront *w);
+ void MagicSimBreak(Wavefront *w);
+ void MagicPrefixSum(Wavefront *w);
+ void MagicReduction(Wavefront *w);
+ void MagicMaskLower(Wavefront *w);
+ void MagicMaskUpper(Wavefront *w);
+ void MagicJoinWFBar(Wavefront *w);
+ void MagicWaitWFBar(Wavefront *w);
+ void MagicPanic(Wavefront *w);
+
+ void MagicAtomicNRAddGlobalU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicAtomicNRAddGroupU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ void MagicXactCasLd(Wavefront *w);
+ void MagicMostSigThread(Wavefront *w);
+ void MagicMostSigBroadcast(Wavefront *w);
+
+ void MagicPrintWF32ID(Wavefront *w);
+ void MagicPrintWFID64(Wavefront *w);
+
+ Call(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "call")
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ // resolve real (non-pseudo) callees at decode time so a
+ // missing function fails fast rather than at execute
+ func_ptr = nullptr;
+ std::string func_name = src0.disassemble();
+ if (!isPseudoOp()) {
+ func_ptr = dynamic_cast<HsailCode*>(obj->
+ getFunction(func_name));
+
+ if (!func_ptr)
+ fatal("call::exec cannot find function: %s\n", func_name);
+ }
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ std::string func_name = src0.disassemble();
+ if (isPseudoOp()) {
+ execPseudoInst(w, gpuDynInst);
+ } else {
+ fatal("Native HSAIL functions are not yet implemented: %s\n",
+ func_name);
+ }
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 2; }
+ };
+
+ // generic "not": bitwise complement for integral types, with a
+ // logical-not specialization for bool (where ~ would be wrong)
+ template<typename T> T heynot(T arg) { return ~arg; }
+ template<> inline bool heynot<bool>(bool arg) { return !arg; }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_DECL_HH__
diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc
new file mode 100644
index 000000000..bbaeb13e6
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+ // captures the HSAIL code object currently being decoded so the
+ // instruction can later refer back to its containing kernel/function
+ HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
+ const std::string &opcode)
+ : GPUStaticInst(opcode), hsailCode(obj->currentCode)
+ {
+ }
+
+ // default disassembly is just the opcode string; operand-bearing
+ // subclasses override this
+ void
+ HsailGPUStaticInst::generateDisassembly()
+ {
+ disassembly = opcode;
+ }
+
+ // lazily generates and caches the disassembly string on first use
+ const std::string&
+ HsailGPUStaticInst::disassemble()
+ {
+ if (disassembly.empty()) {
+ generateDisassembly();
+ assert(!disassembly.empty());
+ }
+
+ return disassembly;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh
new file mode 100644
index 000000000..29aab1f70
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing HSAIL GPU static instructions.
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+class BrigObject;
+class HsailCode;
+
+namespace HsailISA
+{
+ // Base class for all HSAIL GPU static instructions; adds a back
+ // pointer to the enclosing HSAIL code object and lazy disassembly.
+ class HsailGPUStaticInst : public GPUStaticInst
+ {
+ public:
+ HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
+ void generateDisassembly();
+ const std::string &disassemble();
+ // all HSAIL instructions are modeled as a fixed 4 bytes
+ uint32_t instSize() { return 4; }
+
+ protected:
+ // code object this instruction was decoded from
+ HsailCode *hsailCode;
+ };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
new file mode 100644
index 000000000..4e70bf46a
--- /dev/null
+++ b/src/arch/hsail/insts/main.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/decl.hh"
+#include "debug/GPUExec.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+
+namespace HsailISA
+{
+ // human-readable type labels used in disassembly opcode suffixes
+ // (bit, signed, unsigned, and floating-point type families)
+ template<> const char *B1::label = "b1";
+ template<> const char *B8::label = "b8";
+ template<> const char *B16::label = "b16";
+ template<> const char *B32::label = "b32";
+ template<> const char *B64::label = "b64";
+
+ template<> const char *S8::label = "s8";
+ template<> const char *S16::label = "s16";
+ template<> const char *S32::label = "s32";
+ template<> const char *S64::label = "s64";
+
+ template<> const char *U8::label = "u8";
+ template<> const char *U16::label = "u16";
+ template<> const char *U32::label = "u32";
+ template<> const char *U64::label = "u64";
+
+ template<> const char *F32::label = "f32";
+ template<> const char *F64::label = "f64";
+
+ // maps a BRIG compare operation to its HSAIL mnemonic for
+ // disassembly; unknown values yield "unknown" rather than faulting
+ const char*
+ cmpOpToString(Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ return "eq";
+ case BRIG_COMPARE_NE:
+ return "ne";
+ case BRIG_COMPARE_LT:
+ return "lt";
+ case BRIG_COMPARE_LE:
+ return "le";
+ case BRIG_COMPARE_GT:
+ return "gt";
+ case BRIG_COMPARE_GE:
+ return "ge";
+ case BRIG_COMPARE_EQU:
+ return "equ";
+ case BRIG_COMPARE_NEU:
+ return "neu";
+ case BRIG_COMPARE_LTU:
+ return "ltu";
+ case BRIG_COMPARE_LEU:
+ return "leu";
+ case BRIG_COMPARE_GTU:
+ return "gtu";
+ case BRIG_COMPARE_GEU:
+ return "geu";
+ case BRIG_COMPARE_NUM:
+ return "num";
+ case BRIG_COMPARE_NAN:
+ return "nan";
+ case BRIG_COMPARE_SEQ:
+ return "seq";
+ case BRIG_COMPARE_SNE:
+ return "sne";
+ case BRIG_COMPARE_SLT:
+ return "slt";
+ case BRIG_COMPARE_SLE:
+ return "sle";
+ case BRIG_COMPARE_SGT:
+ return "sgt";
+ case BRIG_COMPARE_SGE:
+ return "sge";
+ case BRIG_COMPARE_SGEU:
+ return "sgeu";
+ case BRIG_COMPARE_SEQU:
+ return "sequ";
+ case BRIG_COMPARE_SNEU:
+ return "sneu";
+ case BRIG_COMPARE_SLTU:
+ return "sltu";
+ case BRIG_COMPARE_SLEU:
+ return "sleu";
+ case BRIG_COMPARE_SNUM:
+ return "snum";
+ case BRIG_COMPARE_SNAN:
+ return "snan";
+ case BRIG_COMPARE_SGTU:
+ return "sgtu";
+ default:
+ return "unknown";
+ }
+ }
+
+ // Retires the work-items currently enabled by the execution mask.
+ // When the whole wavefront has completed it releases its LDS and
+ + // vector-register resources; when the whole work-group has
+ // completed it injects a kernel-end release fence (or reschedules
+ // the dispatcher if other wavefronts remain).
+ void
+ Ret::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ // mask off completed work-items
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->init_mask[lane] = 0;
+ }
+
+ }
+
+ // delete extra instructions fetched for completed work-items
+ w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+ w->instructionBuffer.end());
+ if (w->pendingFetch) {
+ w->dropFetch = true;
+ }
+
+ // if all work-items have completed, then wave-front is done
+ if (w->init_mask.none()) {
+ w->status = Wavefront::S_STOPPED;
+
+ // refCount counts live wavefronts of this work-group sharing
+ // the LDS allocation
+ int32_t refCount = w->computeUnit->getLds().
+ decreaseRefCounter(w->dispatchid, w->wg_id);
+
+ DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
+ w->computeUnit->cu_id, w->wg_id, refCount);
+
+ // free the vector registers of the completed wavefront
+ w->computeUnit->vectorRegsReserved[w->simdId] -=
+ w->reservedVectorRegs;
+
+ assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
+
+ // the register region may wrap around the end of the VRF
+ uint32_t endIndex = (w->startVgprIndex +
+ w->reservedVectorRegs - 1) %
+ w->computeUnit->vrf[w->simdId]->numRegs();
+
+ w->computeUnit->vrf[w->simdId]->manager->
+ freeRegion(w->startVgprIndex, endIndex);
+
+ w->reservedVectorRegs = 0;
+ w->startVgprIndex = 0;
+ w->computeUnit->completedWfs++;
+
+ DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
+
+ if (!refCount) {
+ // Notify Memory System of Kernel Completion
+ // Kernel End = isKernel + isRelease
+ w->status = Wavefront::S_RETURNING;
+ GPUDynInstPtr local_mempacket = gpuDynInst;
+ local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
+ local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
+ local_mempacket->useContinuation = false;
+ local_mempacket->simdId = w->simdId;
+ local_mempacket->wfSlotId = w->wfSlotId;
+ local_mempacket->wfDynId = w->wfDynId;
+ w->computeUnit->injectGlobalMemFence(local_mempacket, true);
+ } else {
+ w->computeUnit->shader->dispatcher->scheduleDispatch();
+ }
+ }
+ }
+
+ // Marks the wavefront as having arrived at the barrier: bumps its
+ // barrier count past the old value and stalls it until released.
+ void
+ Barrier::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ // a wavefront may only be at one barrier at a time
+ assert(w->barrier_cnt == w->old_barrier_cnt);
+ w->barrier_cnt = w->old_barrier_cnt + 1;
+ w->stalledAtBarrier = true;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc
new file mode 100644
index 000000000..97d4c902b
--- /dev/null
+++ b/src/arch/hsail/insts/mem.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/mem.hh"
+
+#include "arch/hsail/Brig.h"
+#include "enums/OpType.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+ const char* atomicOpToString(BrigAtomicOperation brigOp);
+
+ // Maps a BRIG atomic operation to the simulator's MemOpType enum,
+ // distinguishing value-returning atomics (BRIG_OPCODE_ATOMIC) from
+ // no-return atomics (BRIG_OPCODE_ATOMICNORET). Unknown codes fault.
+ Enums::MemOpType
+ brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
+ {
+ if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_AAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_AOR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_AXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ACAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_AEXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_AADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_AINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ADEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_AMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_AMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ASUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_ANRAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_ANROR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_ANRXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ANRCAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_ANREXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_ANRADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_ANRINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ANRDEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_ANRMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_ANRMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ANRSUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else {
+ fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
+ }
+ }
+
+ // maps a BRIG atomic operation to its mnemonic for disassembly;
+ // unknown values yield "unknown" rather than faulting
+ const char*
+ atomicOpToString(BrigAtomicOperation brigOp)
+ {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return "and";
+ case BRIG_ATOMIC_OR:
+ return "or";
+ case BRIG_ATOMIC_XOR:
+ return "xor";
+ case BRIG_ATOMIC_CAS:
+ return "cas";
+ case BRIG_ATOMIC_EXCH:
+ return "exch";
+ case BRIG_ATOMIC_ADD:
+ return "add";
+ case BRIG_ATOMIC_WRAPINC:
+ return "inc";
+ case BRIG_ATOMIC_WRAPDEC:
+ return "dec";
+ case BRIG_ATOMIC_MIN:
+ return "min";
+ case BRIG_ATOMIC_MAX:
+ return "max";
+ case BRIG_ATOMIC_SUB:
+ return "sub";
+ default:
+ return "unknown";
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
new file mode 100644
index 000000000..d3ce76dee
--- /dev/null
+++ b/src/arch/hsail/insts/mem.hh
@@ -0,0 +1,1629 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
+#define __ARCH_HSAIL_INSTS_MEM_HH__
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+
+namespace HsailISA
+{
+    // Mixin carrying per-memory-instruction state shared by the ld/st/lda
+    // instruction templates: the per-lane access size in bytes and a
+    // pointer to the (externally owned) address operand.
+    class MemInst
+    {
+      public:
+        // Default state: no access size, no address operand yet.
+        MemInst() : size(0), addr_operand(nullptr) { }
+
+        // Derive the access size from the memory data type.  64-bit
+        // types access 8 bytes, 32-bit types 4, 16-bit types 2, and all
+        // remaining (8-bit) types 1.
+        MemInst(Enums::MemType m_type) : addr_operand(nullptr)
+        {
+            switch (m_type) {
+              case Enums::M_U64:
+              case Enums::M_S64:
+              case Enums::M_F64:
+                size = 8;
+                break;
+              case Enums::M_U32:
+              case Enums::M_S32:
+              case Enums::M_F32:
+                size = 4;
+                break;
+              case Enums::M_U16:
+              case Enums::M_S16:
+              case Enums::M_F16:
+                size = 2;
+                break;
+              default:
+                // all other types are byte sized
+                size = 1;
+                break;
+            }
+        }
+
+        // Record the address operand; the operand object is owned by the
+        // enclosing instruction, not by MemInst.
+        void
+        init_addr(AddrOperandBase *_addr_operand)
+        {
+            addr_operand = _addr_operand;
+        }
+
+      private:
+        int size;
+        AddrOperandBase *addr_operand;
+
+      public:
+        int getMemOperandSize() const { return size; }
+        AddrOperandBase *getAddressOperand() const { return addr_operand; }
+    };
+
+    // Base class for the HSAIL lda ("load address") instruction: computes
+    // the address described by operand 1 and writes it into the destination
+    // register given by operand 0.  No memory is accessed.
+    template<typename DestOperandType, typename AddrOperandType>
+    class LdaInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        // Decode the destination (operand 0) and address (operand 1)
+        // from the BRIG instruction's operand list.
+        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                    const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(op_offs, obj);
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Operand bookkeeping: index 0 is the destination register;
+        // index 1 exists only when the address expression itself uses a
+        // register (see getNumOperands()).
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        // Only the address register (index > 0) can be a source.
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+        // Two operands when the address uses a vector register, else
+        // just the destination.
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            return 1;
+        }
+    };
+
+    // Concrete lda instruction: binds the generic base to a destination
+    // data type and registers its address operand with the MemInst mixin
+    // so common code can query the operand.
+    template<typename DestDataType, typename AddrOperandType>
+    class LdaInst :
+        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                const char *_opcode)
+            : LdaInstBase<typename DestDataType::OperandType,
+                          AddrOperandType>(ib, obj, _opcode)
+        {
+            // expose the decoded address operand through MemInst
+            init_addr(&this->addr);
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode an lda instruction, selecting the address-operand flavor
+    // (no register, single- or double-width register) from the BRIG
+    // operand that describes the address (operand 1).  V2/V4 vector
+    // registers are not legal address operands.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
+
+        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
+        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (regDataType.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
+              default:
+                // report the field the switch actually examined
+                // (regKind, not type), matching decodeLd2/decodeSt
+                fatal("Bad ldas register operand type %d\n",
+                      regDataType.regKind);
+            }
+        } else {
+            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
+        }
+    }
+
+    // Base class for HSAIL loads.  Handles two BRIG encodings: a plain
+    // ld (BrigInstMem) and an atomic used as a load (BrigInstAtomic).
+    // Captures the segment, memory order/scope, equivalence class and
+    // width, and decodes the destination and address operands.
+    template<typename MemOperandType, typename DestOperandType,
+             typename AddrOperandType>
+    class LdInstBase : public HsailGPUStaticInst
+    {
+      public:
+        Brig::BrigWidth8_t width;
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigMemoryScope memoryScope;
+        unsigned int equivClass;
+        // kernarg/arg loads read launch arguments rather than memory
+        // written by the kernel itself
+        bool isArgLoad()
+        {
+            return segment == Brig::BRIG_SEGMENT_KERNARG ||
+                   segment == Brig::BRIG_SEGMENT_ARG;
+        }
+        // Decode a plain ld (BrigInstMem): no ordering/scope, takes the
+        // equivalence class and width from the instruction.
+        void
+        initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // map the BRIG segment onto the simulator's operation type
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = ldst->width;
+            // operand 0 is the destination register (skipped if the
+            // operand is not a register)
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            // operand 1 is the address
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Decode an atomic load (BrigInstAtomic): ordering and scope come
+        // from the instruction; equivalence class is fixed at 0 and the
+        // width at 1.
+        void
+        initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            equivClass = 0;
+
+            // same segment -> operation-type mapping as initLd()
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = BRIG_WIDTH_1;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands,1);
+            addr.init(op_offs, obj);
+        }
+
+        // Dispatch to the decoder matching the BRIG encoding.
+        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_LD) {
+                initLd(ib, obj, _opcode);
+            } else {
+                initAtomicLd(ib, obj, _opcode);
+            }
+        }
+
+        // Operand bookkeeping: index 0 is the destination, index 1 (when
+        // the address uses a vector register) the address register.
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        // Only the address register (index > 0) acts as a source.
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+    };
+
+    // Concrete HSAIL load.  Supports scalar destinations as well as
+    // v2/v4 register-list destinations (up to 4 registers), and issues
+    // the memory-system requests for each active lane.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrOperandType>
+    class LdInst :
+        public LdInstBase<typename MemDataType::CType,
+                          typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+        // destinations when operand 0 is an operand list (vector ld)
+        typename DestDataType::OperandType::DestOperand dest_vect[4];
+        uint16_t num_dest_operands;
+        void generateDisassembly();
+
+      public:
+        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+            : LdInstBase<typename MemDataType::CType,
+                         typename DestDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)brigOp;
+
+                // the data section stores the list's byte length; each
+                // element reference is 4 bytes
+                num_dest_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_dest_operands <= 4);
+            } else {
+                num_dest_operands = 1;
+            }
+
+            if (num_dest_operands > 1) {
+                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_dest_operands; ++i) {
+                    dest_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        // Issue the load: reads the LDS chunk directly for group-segment
+        // accesses, otherwise sends one read request per active lane and
+        // destination operand through the compute unit.
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            // one outstanding-status bit per lane in the exec mask
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            if (num_dest_operands > 1) {
+                // per-lane count of expected responses for vector loads
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_dest_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_dest_operands; ++k) {
+
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        // consecutive destination registers read
+                        // consecutive elements at the lane's address
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            // load from shared memory
+                            *d = gpuDynInst->wavefront()->ldsChunk->
+                                read<c0>(vaddr);
+                        } else {
+                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
+                                          gpuDynInst->computeUnit()->masterId(),
+                                          0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+                            pkt->dataStatic(d);
+
+                            if (gpuDynInst->computeUnit()->shader->
+                                separate_acquire_release &&
+                                gpuDynInst->memoryOrder ==
+                                Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                                // if this load has acquire semantics,
+                                // set the response continuation function
+                                // to perform an Acquire request
+                                gpuDynInst->execContinuation =
+                                    &GPUStaticInst::execLdAcq;
+
+                                gpuDynInst->useContinuation = true;
+                            } else {
+                                // the request will be finished when
+                                // the load completes
+                                gpuDynInst->useContinuation = false;
+                            }
+                            // translation is performed in sendRequest()
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      private:
+        void
+        execLdAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after the load has complete and if the load has acquire
+            // semantics, issue an acquire request.
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+
+      public:
+        // group-segment loads hit the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+        // Operand bookkeeping.  When the address uses a register it is
+        // reported as the LAST operand, after the destination(s); the
+        // "num_dest_operands != getNumOperands()" test detects that case.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            if (num_dest_operands > 1) {
+                return dest_vect[operandIndex].isVectorRegister();
+            }
+            else if (num_dest_operands == 1) {
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isVectorRegister();
+            }
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isCondRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isCondRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isScalarRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isScalarRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isScalarRegister();
+            return false;
+        }
+        // only the trailing address register is a source
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return false;
+            return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.opSize());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].opSize());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.opSize());
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.regIndex());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].regIndex());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.regIndex());
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return(num_dest_operands+1);
+            else
+                return(num_dest_operands);
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Second decode stage for ld: given the memory and destination data
+    // types, pick the address-operand flavor by inspecting the BRIG
+    // operand that describes the address (operand 1).
+    template<typename MemDT, typename DestDT>
+    GPUStaticInst*
+    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned addr_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo info = findRegDataType(addr_offs, obj);
+
+        // plain address expression: no address register involved
+        if (info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS)
+            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
+
+        if (info.kind != Brig::BRIG_KIND_OPERAND_REGISTER &&
+            info.kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+            fatal("Bad ld register operand kind %d\n", info.kind);
+        }
+
+        // register-based address: choose by register width
+        if (info.regKind == Brig::BRIG_REGISTER_KIND_SINGLE)
+            return new LdInst<MemDT, DestDT, SRegAddrOperand>(ib, obj, "ld");
+
+        if (info.regKind == Brig::BRIG_REGISTER_KIND_DOUBLE)
+            return new LdInst<MemDT, DestDT, DRegAddrOperand>(ib, obj, "ld");
+
+        fatal("Bad ld register operand type %d\n", info.regKind);
+    }
+
+    // First decode stage for ld: map the instruction's BRIG type and the
+    // destination register width onto a concrete destination data type,
+    // then defer to decodeLd2 for the address-operand flavor.  Sub-word
+    // types widen to 32 bits; floats reuse the same-width unsigned type.
+    template<typename MemDT>
+    GPUStaticInst*
+    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);
+
+        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+        switch(dest.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+              case Brig::BRIG_TYPE_B16:
+              case Brig::BRIG_TYPE_B32:
+                return decodeLd2<MemDT, B32>(ib, obj);
+              case Brig::BRIG_TYPE_U8:
+              case Brig::BRIG_TYPE_U16:
+              case Brig::BRIG_TYPE_U32:
+                return decodeLd2<MemDT, U32>(ib, obj);
+              case Brig::BRIG_TYPE_S8:
+              case Brig::BRIG_TYPE_S16:
+              case Brig::BRIG_TYPE_S32:
+                return decodeLd2<MemDT, S32>(ib, obj);
+              case Brig::BRIG_TYPE_F16:
+              case Brig::BRIG_TYPE_F32:
+                // float destinations are carried in unsigned registers
+                return decodeLd2<MemDT, U32>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B64:
+                return decodeLd2<MemDT, B64>(ib, obj);
+              case Brig::BRIG_TYPE_U64:
+                return decodeLd2<MemDT, U64>(ib, obj);
+              case Brig::BRIG_TYPE_S64:
+                return decodeLd2<MemDT, S64>(ib, obj);
+              case Brig::BRIG_TYPE_F64:
+                // float destinations are carried in unsigned registers
+                return decodeLd2<MemDT, U64>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          default:
+            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
+                  ib->type);
+        }
+    }
+
+    // Base class for HSAIL stores.  Handles two BRIG encodings: a plain
+    // st (BrigInstMem) and an atomic used as a store (BrigInstAtomic).
+    // Note the operand order differs: st has (src, addr) while the
+    // atomic form has (addr, src).
+    template<typename MemDataType, typename SrcOperandType,
+             typename AddrOperandType>
+    class StInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename SrcOperandType::SrcOperand src;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigMemoryOrder memoryOrder;
+        unsigned int equivClass;
+
+        // Decode a plain st (BrigInstMem): no ordering/scope; source is
+        // operand 0, address operand 1.
+        void
+        initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // map the BRIG segment onto the simulator's operation type;
+            // kernarg is absent here since kernarg is not writable
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            // operand 0 is the source: either an immediate (constant
+            // bytes) or a register
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
+                (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
+                src.init(op_offs, obj);
+            }
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Decode an atomic store (BrigInstAtomic): ordering and scope
+        // come from the instruction; address is operand 0, source
+        // operand 1 (reversed relative to initSt).
+        void
+        initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            equivClass = 0;
+
+            // same segment -> operation-type mapping as initSt()
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            addr.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            src.init(op_offs, obj);
+        }
+
+        // Dispatch to the decoder matching the BRIG encoding.
+        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_ST) {
+                initSt(ib, obj, _opcode);
+            } else {
+                initAtomicSt(ib, obj, _opcode);
+            }
+        }
+
+        // Operand bookkeeping: stores have no destination; index 0 is
+        // the source, index 1 (when present) the address register.
+        int numDstRegOperands() { return 0; }
+        int numSrcRegOperands()
+        {
+            return src.isVectorRegister() + this->addr.isVectorRegister();
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isVectorRegister() :
+                   this->addr.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isCondRegister() :
+                   this->addr.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isScalarRegister() :
+                   this->addr.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.opSize() : this->addr.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.regIndex() : this->addr.regIndex();
+        }
+    };
+
+
+    // Concrete HSAIL store.  Supports scalar sources as well as v2/v4
+    // register-list sources (up to 4 registers), and implements
+    // release semantics by issuing a release fence before the stores
+    // when the shader splits acquire/release.
+    template<typename MemDataType, typename SrcDataType,
+             typename AddrOperandType>
+    class StInst :
+        public StInstBase<MemDataType, typename SrcDataType::OperandType,
+                          AddrOperandType>,
+        public MemInst
+    {
+      public:
+        // sources when the store's data operand is an operand list
+        typename SrcDataType::OperandType::SrcOperand src_vect[4];
+        uint16_t num_src_operands;
+        void generateDisassembly();
+
+        // srcIdx selects which BRIG operand carries the data (0 for a
+        // plain st, 1 for the atomic form).
+        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode, int srcIdx)
+            : StInstBase<MemDataType, typename SrcDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(SrcDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            BrigRegOperandInfo rinfo;
+            unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
+            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                const Brig::BrigOperandConstantBytes *op =
+                    (Brig::BrigOperandConstantBytes*)baseOp;
+
+                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
+                                           Brig::BRIG_TYPE_NONE);
+            } else {
+                rinfo = findRegDataType(op_offs, obj);
+            }
+
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)baseOp;
+
+                // the data section stores the list's byte length; each
+                // element reference is 4 bytes
+                num_src_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_src_operands <= 4);
+            } else {
+                num_src_operands = 1;
+            }
+
+            if (num_src_operands > 1) {
+                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_src_operands; ++i) {
+                    src_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before performing a store, check if this store has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    // execSt runs as the continuation once the release
+                    // fence completes
+                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
+                    gpuDynInst->useContinuation = true;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, perform stores immediately
+            execSt(gpuDynInst);
+        }
+
+        // group-segment stores hit the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execSt may be called through a continuation
+        // if the store had release semantics. see comment for
+        // execSt in gpu_static_inst.hh
+        void
+        execSt(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            // one outstanding-status bit per lane in the exec mask
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            if (num_src_operands > 1) {
+                // per-lane count of expected responses for vector stores
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_src_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_src_operands; ++k) {
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        // consecutive source registers write consecutive
+                        // elements at the lane's address
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            //store to shared memory
+                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
+                                                                         *d);
+                        } else {
+                            Request *req =
+                              new Request(0, vaddr, sizeof(c0), 0,
+                                          gpuDynInst->computeUnit()->masterId(),
+                                          0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+                            pkt->dataStatic<c0>(d);
+
+                            // translation is performed in sendRequest()
+                            // the request will be finished when the store completes
+                            gpuDynInst->useContinuation = false;
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      public:
+        // Operand bookkeeping.  When the address uses a register it is
+        // reported as the last operand, at index num_src_operands.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isVectorRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isVectorRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isVectorRegister();
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isCondRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isCondRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isScalarRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isScalarRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isScalarRegister();
+            return false;
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.opSize();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].opSize();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.opSize();
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.regIndex();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].regIndex();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.regIndex();
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return num_src_operands + 1;
+            else
+                return num_src_operands;
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode a store (or atomic-as-store), selecting the address-operand
+    // flavor from the BRIG operand holding the address.  For a plain st
+    // the data is operand 0 and the address operand 1; the atomic forms
+    // reverse that order.  V2/V4 address registers are not allowed.
+    template<typename DataType, typename SrcDataType>
+    GPUStaticInst*
+    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        int srcIdx = 0;
+        int destIdx = 1;
+        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
+            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
+            srcIdx = 1;
+            destIdx = 0;
+        }
+        unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx);
+
+        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new StInst<DataType, SrcDataType,
+                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
+        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (tmp.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new StInst<DataType, SrcDataType,
+                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new StInst<DataType, SrcDataType,
+                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
+              default:
+                // report the field the switch actually examined
+                // (regKind, not type), matching decodeLd2
+                fatal("Bad st register operand type %d\n", tmp.regKind);
+            }
+        } else {
+            fatal("Bad st register operand kind %d\n", tmp.kind);
+        }
+    }
+
+ Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
+ Brig::BrigAtomicOperation brigOp);
+
+    // Decoded operand/state bundle shared by all HSAIL atomics.  Extracts
+    // segment, memory order/scope and the atomic operation from the
+    // BrigInstAtomic, and initializes dest (when HasDst), the address, and
+    // the NumSrcOperands sources.  Accessor index convention:
+    //   [0, NumSrcOperands)  -> source operands
+    //   NumSrcOperands       -> address operand
+    //   NumSrcOperands + 1   -> destination (when present)
+    // NOTE(review): getNumOperands() only reserves the extra dest slot when
+    // the address is a vector register, so with a non-register address the
+    // last valid index maps to the address accessors rather than dest --
+    // confirm this is intentional.
+    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>
+    class AtomicInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename OperandType::DestOperand dest;
+        typename OperandType::SrcOperand src[NumSrcOperands];
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigAtomicOperation atomicOperation;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigOpcode opcode;
+        Enums::MemOpType opType;
+
+        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
+            opcode = (BrigOpcode)ib->opcode;
+            opType = brigAtomicToMemOpType(opcode, atomicOperation);
+
+            // classify the op by segment for stats/scheduling
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_ATOMIC;
+                break;
+
+              default:
+                panic("Atomic: segment %d not supported\n", segment);
+            }
+
+            // operand order in BRIG differs: returning atomics put the
+            // destination first, then the address, then the sources
+            if (HasDst) {
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                dest.init(op_offs, obj);
+
+                op_offs = obj->getOperandPtr(ib->operands, 1);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
+                    src[i].init(op_offs, obj);
+                }
+            } else {
+
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
+                    src[i].init(op_offs, obj);
+                }
+            }
+        }
+
+        // count of vector-register sources, including a register address
+        int numSrcRegOperands()
+        {
+            int operands = 0;
+            for (int i = 0; i < NumSrcOperands; i++) {
+                if (src[i].isVectorRegister() == true) {
+                    operands++;
+                }
+            }
+            if (addr.isVectorRegister())
+                operands++;
+            return operands;
+        }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands()
+        {
+            if (addr.isVectorRegister())
+                return(NumSrcOperands + 2);
+            return(NumSrcOperands + 1);
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isVectorRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isVectorRegister());
+            else
+                return dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isCondRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isCondRegister());
+            else
+                return dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isScalarRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isScalarRegister());
+            else
+                return dest.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return true;
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isVectorRegister());
+            else
+                return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            // bounds-check the index, consistent with the other accessors
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex <= NumSrcOperands)
+                return false;
+            else
+                return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].opSize());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.opSize());
+            else
+                return(dest.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].regIndex());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.regIndex());
+            else
+                return(dest.regIndex());
+        }
+    };
+
+    // Executable atomic instruction: couples the decoded operand state
+    // (AtomicInstBase) with memory-pipeline plumbing (MemInst), and
+    // implements the RMW itself plus any acquire/release fence traffic.
+    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>
+    class AtomicInst :
+        public AtomicInstBase<typename MemDataType::OperandType,
+                              AddrOperandType, NumSrcOperands, HasDst>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
+                             NumSrcOperands, HasDst>
+                (ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before doing the RMW, check if this atomic has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && (gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the RMW itself runs as a continuation once the
+                    // release fence has completed
+                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
+                    gpuDynInst->useContinuation = true;
+
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, execute the RMW immediately
+            execAtomic(gpuDynInst);
+
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+
+        // group-segment atomics go to the LDS rather than global memory
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execAtomic may be called through a continuation
+        // if the RMW had release semantics. see comment for
+        // execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomic(GPUDynInstPtr gpuDynInst) override
+        {
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            typedef typename MemDataType::CType c0;
+
+            // d: per-lane old value (result), e: first source operand,
+            // f: second source operand (CAS swap value)
+            c0 *d = &((c0*) gpuDynInst->d_data)[0];
+            c0 *e = &((c0*) gpuDynInst->a_data)[0];
+            c0 *f = &((c0*) gpuDynInst->x_data)[0];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (gpuDynInst->exec_mask[i]) {
+                    Addr vaddr = gpuDynInst->addr[i];
+
+                    if (isLocalMem()) {
+                        // LDS atomics are performed functionally, in place;
+                        // the old value is captured before the update
+                        Wavefront *wavefront = gpuDynInst->wavefront();
+                        *d = wavefront->ldsChunk->read<c0>(vaddr);
+
+                        switch (this->opType) {
+                          case Enums::MO_AADD:
+                          case Enums::MO_ANRADD:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
+                            break;
+                          case Enums::MO_ASUB:
+                          case Enums::MO_ANRSUB:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
+                            break;
+                          case Enums::MO_AMAX:
+                          case Enums::MO_ANRMAX:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AMIN:
+                          case Enums::MO_ANRMIN:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AAND:
+                          case Enums::MO_ANRAND:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
+                            break;
+                          case Enums::MO_AOR:
+                          case Enums::MO_ANROR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
+                            break;
+                          case Enums::MO_AXOR:
+                          case Enums::MO_ANRXOR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
+                            break;
+                          case Enums::MO_AINC:
+                          case Enums::MO_ANRINC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
+                            break;
+                          case Enums::MO_ADEC:
+                          case Enums::MO_ANRDEC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
+                            break;
+                          case Enums::MO_AEXCH:
+                          case Enums::MO_ANREXCH:
+                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
+                            break;
+                          case Enums::MO_ACAS:
+                          case Enums::MO_ANRCAS:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
+                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
+                            break;
+                          default:
+                            fatal("Unrecognized or invalid HSAIL atomic op "
+                                  "type.\n");
+                            break;
+                        }
+                    } else {
+                        // global atomics go out as per-lane SwapReq packets
+                        // carrying an atomic-op functor
+                        Request *req =
+                            new Request(0, vaddr, sizeof(c0), 0,
+                                        gpuDynInst->computeUnit()->masterId(),
+                                        0, gpuDynInst->wfDynId, i,
+                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
+                                        f, this->opType));
+
+                        gpuDynInst->setRequestFlags(req);
+                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+                        pkt->dataStatic(d);
+
+                        if (gpuDynInst->computeUnit()->shader->
+                            separate_acquire_release &&
+                            (gpuDynInst->memoryOrder ==
+                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
+                            // if this atomic has acquire semantics,
+                            // schedule the continuation to perform an
+                            // acquire after the RMW completes
+                            gpuDynInst->execContinuation =
+                                &GPUStaticInst::execAtomicAcq;
+
+                            gpuDynInst->useContinuation = true;
+                        } else {
+                            // the request will be finished when the RMW completes
+                            gpuDynInst->useContinuation = false;
+                        }
+                        // translation is performed in sendRequest()
+                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
+                                                               pkt);
+                    }
+                }
+
+                ++d;
+                ++e;
+                ++f;
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+        // execAtomicACq will always be called through a continuation.
+        // see comment for execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after performing the RMW, check to see if this instruction
+            // has acquire semantics, and if so, issue an acquire
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                     && gpuDynInst->memoryOrder ==
+                     Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the request will be finished when
+                    // the acquire completes
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+    };
+
+    // Build the final instruction for an atomic whose address-operand type
+    // has already been resolved.  Atomic LD/ST are routed to the ordinary
+    // load/store decoders; everything else becomes an AtomicInst, with or
+    // without a result depending on the opcode.
+    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
+    GPUStaticInst*
+    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
+            return decodeLd<DataType>(ib, obj);
+        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
+            // NOTE(review): every width passes S8 as the memory data type,
+            // varying only the source type -- confirm this is intentional
+            // and not a copy-paste of the B8 case.
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+                return decodeSt<S8,S8>(ib, obj);
+              case Brig::BRIG_TYPE_B16:
+                return decodeSt<S8,S16>(ib, obj);
+              case Brig::BRIG_TYPE_B32:
+                return decodeSt<S8,S32>(ib, obj);
+              case Brig::BRIG_TYPE_B64:
+                return decodeSt<S8,S64>(ib, obj);
+              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
+            }
+        } else {
+            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, false>(ib, obj, "atomicnoret");
+            else
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, true>(ib, obj, "atomic");
+        }
+    }
+
+    // Resolve the address-operand flavor (immediate address vs. single or
+    // double register) for an atomic and forward to constructAtomic.
+    template<typename DataType, int NumSrcOperands>
+    GPUStaticInst*
+    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // returning atomics carry a destination at operand 0, so their
+        // address operand shifts to index 1
+        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
+            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
+
+        unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex);
+
+        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return constructAtomic<DataType, NoRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (tmp.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                  return constructAtomic<DataType, SRegAddrOperand,
+                                         NumSrcOperands>(ib, obj);
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return constructAtomic<DataType, DRegAddrOperand,
+                                       NumSrcOperands>(ib, obj);
+              default:
+                // report the field the switch actually examined (regKind),
+                // not the unrelated data-type field
+                fatal("Bad atomic register operand type %d\n", tmp.regKind);
+            }
+        } else {
+            fatal("Bad atomic register operand kind %d\n", tmp.kind);
+        }
+    }
+
+
+    // Decode a result-returning HSAIL atomic.  CAS carries two source
+    // operands (compare and swap values); every other atomic op carries one.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        return (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) ?
+            decodeAtomicHelper<DataType, 2>(ib, obj) :
+            decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+
+    // Decode a no-return HSAIL atomic; like decodeAtomic, only CAS needs
+    // a second source operand.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        return (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) ?
+            decodeAtomicHelper<DataType, 2>(ib, obj) :
+            decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_MEM_HH__
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/generic_types.hh"
+#include "gpu-compute/hsail_code.hh"
+
+// defined in code.cc, but not worth sucking in all of code.h for this
+// at this point
+extern const char *segmentNames[];
+
+namespace HsailISA
+{
+    // Disassemble as "lda_<type> dest,addr".
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        std::string destStr = this->dest.disassemble();
+        std::string addrStr = this->addr.disassemble();
+
+        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
+                                     DestDataType::label, destStr, addrStr);
+    }
+
+    // lda simply materializes the effective address of each active lane
+    // into the destination register; no memory request is issued.
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        typedef typename DestDataType::CType CType M5_VAR_USED;
+        const VectorMask &pred = wf->get_pred();
+
+        uint64_t laneAddr[VSZ];
+        this->addr.calcVector(wf, laneAddr);
+
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (pred[ln]) {
+                this->dest.set(wf, ln, laneAddr[ln]);
+            }
+        }
+    }
+
+    // Disassemble a load:
+    //   scalar: "ld_<seg>_<type> dest,addr"
+    //   vector: "ld_<seg>_<type> (d0,d1[,d2,d3]), addr"
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        if (num_dest_operands == 1) {
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label,
+                                         this->dest.disassemble(),
+                                         this->addr.disassemble());
+        } else if (num_dest_operands == 2 || num_dest_operands == 4) {
+            // join the vector destinations with commas, then parenthesize
+            std::string destList = this->dest_vect[0].disassemble();
+            for (int i = 1; i < num_dest_operands; ++i) {
+                destList += "," + this->dest_vect[i].disassemble();
+            }
+
+            this->disassembly = csprintf("%s_%s_%s (%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label, destList,
+                                         this->addr.disassemble());
+        } else {
+            fatal("Bad ld register dest operand, num vector operands: %d \n",
+                  num_dest_operands);
+        }
+    }
+
+    // Map a work-item-relative private-segment address to a flat address.
+    //
+    // The compiler does not emit enough symbol information to locate the
+    // object being accessed, so private space is laid out by interleaving
+    // the work-items of a wavefront at an 8-byte granularity.  This
+    // coalesces less well than the spill-space scheme, but spill-style
+    // addressing cannot be used here because the same private address may
+    // be touched by differently sized loads/stores.
+    //
+    // NOTE: assumes the widest private access is 8 bytes; a larger access
+    // would require a larger interleave stride.
+    static Addr
+    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
+    {
+        const Addr chunk = addr / 8;
+        const Addr offsetInChunk = addr % 8;
+
+        Addr flatAddr =
+            chunk * 8 * VSZ + lane * 8 + offsetInChunk + w->privBase;
+
+        assert(flatAddr < w->privBase + (w->privSizePerItem * VSZ));
+
+        return flatAddr;
+    }
+
+    // Execute a load: kernarg and arg segments are serviced functionally
+    // in place; every other segment builds a GPUDynInst memory request and
+    // pushes it into the appropriate (global or local) memory pipeline.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename MemDataType::CType MemCType;
+        const VectorMask &mask = w->get_pred();
+
+        // Kernarg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
+            MemCType val;
+
+            // I assume no vector ld for kernargs
+            assert(num_dest_operands == 1);
+
+            // assuming for the moment that we'll never do register
+            // offsets into kernarg space... just to make life simpler
+            uint64_t address = this->addr.calcUniform();
+
+            val = *(MemCType*)&w->kernelArgs[address];
+
+            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    MemCType val = w->readCallArgMem<MemCType>(lane, address);
+
+                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
+                            (unsigned long long)val);
+
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = MemDataType::memType;
+        m->v_type = DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        // record which VGPR(s) receive the data when the reply returns
+        if (num_dest_operands == 1) {
+            m->dst_reg = this->dest.regIndex();
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_dest_operands;
+            for (int i = 0; i < num_dest_operands; ++i) {
+                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
+            }
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            // addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (m->addr[lane] < w->privSizePerItem) {
+                    if (mask[lane]) {
+                        // what is the size of the object we are accessing?
+                        // find base for for this wavefront
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_dest_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    //  note: this calculation will NOT WORK if the compiler
+                    //  ever generates loads/stores to the same address with
+                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(MemCType) + w->spillBase;
+
+                        w->last_addr[lane] = m->addr[lane];
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_READONLY:
+            m->s_type = SEG_READONLY;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
+                    m->addr[lane] += w->roBase;
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] +
+                            lane * sizeof(MemCType) + w->privBase;
+                    }
+                }
+            }
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            // format string fixed: was "%llxe" (stray trailing 'e')
+            fatal("Load to unsupported segment %d %llx\n", this->segment,
+                  m->addr[0]);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Execute a store: arg-segment stores are serviced functionally in
+    // place; every other segment gathers the source data into d_data,
+    // builds a GPUDynInst request and pushes it into the appropriate
+    // (global or local) memory pipeline.
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename OperationType::CType CType;
+
+        const VectorMask &mask = w->get_pred();
+
+        // arg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    CType data = this->src.template get<CType>(w, lane);
+                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
+                    w->writeCallArgMem<CType>(lane, address, data);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        m->exec_mask = w->execMask();
+
+        this->addr.calcVector(w, m->addr);
+
+        // stage the store data into d_data, one VSZ-wide slab per source
+        // operand
+        if (num_src_operands == 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    ((CType*)m->d_data)[lane] =
+                        this->src.template get<CType>(w, lane);
+                }
+            }
+        } else {
+            for (int k= 0; k < num_src_operands; ++k) {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        ((CType*)m->d_data)[k * VSZ + lane] =
+                            this->src_vect[k].template get<CType>(w, lane);
+                    }
+                }
+            }
+        }
+
+        m->m_op = Enums::MO_ST;
+        m->m_type = OperationType::memType;
+        m->v_type = OperationType::vgprType;
+
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+
+        if (num_src_operands == 1) {
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_src_operands;
+        }
+
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            // addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    if (m->addr[lane] < w->privSizePerItem) {
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(CType)-1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_src_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                // spill space interleaves lanes at spillWidth granularity
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(CType) + w->spillBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+                        m->addr[lane] = m->addr[lane] + lane *
+                            sizeof(CType)+w->privBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Store to unsupported segment %d\n", this->segment);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Disassemble a store:
+    //   scalar: "st_<seg>_<type> src,addr"
+    //   vector: "st_<seg>_<type> (s0,s1[,s2,s3]), addr"
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::generateDisassembly()
+    {
+        switch (num_src_operands) {
+          case 1:
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src.disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 2:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 4:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->src_vect[2].disassemble(),
+                                         this->src_vect[3].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          default:
+            // message fixed: previously said "ld" (copy-paste from the
+            // load disassembler) even though this is the store path
+            fatal("Bad st register src operand, num vector operands: "
+                  "%d \n", num_src_operands);
+            break;
+        }
+    }
+
+    // Execute an atomic: stage the source operand(s) into a_data/x_data,
+    // fill in the GPUDynInst bookkeeping, and push the request into the
+    // global or local memory pipeline.  An atomic counts as both a read
+    // and a write for the outstanding-request accounting.
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+               HasDst>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        typedef typename DataType::CType CType;
+
+        Wavefront *w = gpuDynInst->wavefront();
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        // first source operand (the atomic's data value)
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((CType *)m->a_data)[lane] =
+                this->src[0].template get<CType>(w, lane);
+        }
+
+        // load second source operand for CAS
+        if (NumSrcOperands > 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                ((CType*)m->x_data)[lane] =
+                    this->src[1].template get<CType>(w, lane);
+            }
+        }
+
+        assert(NumSrcOperands <= 2);
+
+        m->m_op = this->opType;
+        m->m_type = DataType::memType;
+        m->v_type = DataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        if (HasDst) {
+            m->dst_reg = this->dest.regIndex();
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->latency.set(w->computeUnit->shader->ticks(64));
+            m->pipeId = GLBMEM_PIPE;
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Atomic op to unsupported segment %d\n",
+                  this->segment);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+ const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
+
+    // Disassemble as "atomic[noret]_<op>_<seg>_<type> [dest,]addr,src...".
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+               HasDst>::generateDisassembly()
+    {
+        std::string text;
+
+        if (HasDst) {
+            text = csprintf("%s_%s_%s_%s %s,%s", this->opcode,
+                            atomicOpToString(this->atomicOperation),
+                            segmentNames[this->segment],
+                            DataType::label, this->dest.disassemble(),
+                            this->addr.disassemble());
+        } else {
+            text = csprintf("%s_%s_%s_%s %s", this->opcode,
+                            atomicOpToString(this->atomicOperation),
+                            segmentNames[this->segment],
+                            DataType::label, this->addr.disassemble());
+        }
+
+        for (int i = 0; i < NumSrcOperands; ++i) {
+            text += "," + this->src[i].disassemble();
+        }
+
+        this->disassembly = text;
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+ // Pseudo (or magic) instructions are overloaded on the hsail call
+ // instruction, because of its flexible parameter signature.
+
+ // To add a new magic instruction:
+ // 1. Add an entry to the enum.
+ // 2. Implement it in the switch statement below (Call::exec).
+ // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+ // so its easy to call from an OpenCL kernel.
+
+ // This enum should be identical to the enum in
+ // hsa/hsail-gpu-compute/util/magicinst.h
+    // Opcode values dispatched on in Call::execPseudoInst below; the
+    // numeric values must stay in sync with the companion enum in
+    // hsa/hsail-gpu-compute/util/magicinst.h.
+    enum
+    {
+        // debug printing of wavefront / per-lane register values
+        MAGIC_PRINT_WF_32 = 0,
+        MAGIC_PRINT_WF_64,
+        MAGIC_PRINT_LANE,
+        MAGIC_PRINT_LANE_64,
+        MAGIC_PRINT_WF_FLOAT,
+        // debugger breakpoint (raises SIGTRAP)
+        MAGIC_SIM_BREAK,
+        // wavefront-wide arithmetic helpers
+        MAGIC_PREF_SUM,
+        MAGIC_REDUCTION,
+        MAGIC_MASKLANE_LOWER,
+        MAGIC_MASKLANE_UPPER,
+        // wavefront barrier join/wait
+        MAGIC_JOIN_WF_BAR,
+        MAGIC_WAIT_WF_BAR,
+        MAGIC_PANIC,
+        // memory-system magic ops
+        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
+        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
+        MAGIC_LOAD_GLOBAL_U32_REG,
+        MAGIC_XACT_CAS_LD,
+        // most-significant-thread helpers
+        MAGIC_MOST_SIG_THD,
+        MAGIC_MOST_SIG_BROADCAST,
+        // id-filtered variants of the wavefront printers
+        MAGIC_PRINT_WFID_32,
+        MAGIC_PRINT_WFID_64
+    };
+
+    // Decode and dispatch a magic (pseudo) instruction overloaded on the
+    // HSAIL call instruction.  Operand 0 of src1 carries the magic opcode;
+    // every active lane must supply the same opcode, otherwise we fatal().
+    void
+    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        const VectorMask &mask = w->get_pred();
+
+        int op = 0;
+        bool got_op = false;
+
+        // Extract the opcode from the first active lane and verify all
+        // other active lanes agree.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val0 = src1.get<int>(w, lane, 0);
+                if (got_op) {
+                    if (src_val0 != op) {
+                        fatal("Multiple magic instructions per PC not "
+                              "supported\n");
+                    }
+                } else {
+                    op = src_val0;
+                    got_op = true;
+                }
+            }
+        }
+
+        // Dispatch to the handler for this opcode; see the enum above.
+        switch(op) {
+          case MAGIC_PRINT_WF_32:
+            MagicPrintWF32(w);
+            break;
+          case MAGIC_PRINT_WF_64:
+            MagicPrintWF64(w);
+            break;
+          case MAGIC_PRINT_LANE:
+            MagicPrintLane(w);
+            break;
+          case MAGIC_PRINT_LANE_64:
+            MagicPrintLane64(w);
+            break;
+          case MAGIC_PRINT_WF_FLOAT:
+            MagicPrintWFFloat(w);
+            break;
+          case MAGIC_SIM_BREAK:
+            MagicSimBreak(w);
+            break;
+          case MAGIC_PREF_SUM:
+            MagicPrefixSum(w);
+            break;
+          case MAGIC_REDUCTION:
+            MagicReduction(w);
+            break;
+          case MAGIC_MASKLANE_LOWER:
+            MagicMaskLower(w);
+            break;
+          case MAGIC_MASKLANE_UPPER:
+            MagicMaskUpper(w);
+            break;
+          case MAGIC_JOIN_WF_BAR:
+            MagicJoinWFBar(w);
+            break;
+          case MAGIC_WAIT_WF_BAR:
+            MagicWaitWFBar(w);
+            break;
+          case MAGIC_PANIC:
+            MagicPanic(w);
+            break;
+
+          // atomic instructions (these issue real memory requests and
+          // therefore need the dynamic instruction object)
+          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_LOAD_GLOBAL_U32_REG:
+            MagicLoadGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_XACT_CAS_LD:
+            MagicXactCasLd(w);
+            break;
+
+          case MAGIC_MOST_SIG_THD:
+            MagicMostSigThread(w);
+            break;
+
+          case MAGIC_MOST_SIG_BROADCAST:
+            MagicMostSigBroadcast(w);
+            break;
+
+          case MAGIC_PRINT_WFID_32:
+            MagicPrintWF32ID(w);
+            break;
+
+          case MAGIC_PRINT_WFID_64:
+            MagicPrintWFID64(w);
+            break;
+
+          default: fatal("unrecognized magic instruction: %d\n", op);
+        }
+    }
+
+    // Print one 32-bit value (operand 1) per active lane, one debug line
+    // per lane.  Operand 2 selects hex (non-zero) vs decimal output.
+    // Compiled out entirely when tracing is disabled.
+    void
+    Call::MagicPrintLane(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+#endif
+    }
+
+    // 64-bit variant of MagicPrintLane: prints operand 1 as an int64_t
+    // for each active lane.  Operand 2 selects hex vs decimal.
+    // NOTE(review): the format strings reuse the 32-bit "%x"/"%d"
+    // specifiers for a 64-bit value; gem5's type-aware cprintf still
+    // prints the full value, but the WF64 printer uses "%016x" — confirm
+    // whether the narrower width here is intentional.
+    void
+    Call::MagicPrintLane64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+#endif
+    }
+
+    // Print one 32-bit value (operand 1) per lane for the whole
+    // wavefront, eight lanes per output row.  Operand 2 selects hex
+    // (non-zero) vs decimal; inactive lanes print as "xxxxxxxx".
+    void
+    Call::MagicPrintWF32(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            // Newline at the end of each 8-lane row, space otherwise.
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Like MagicPrintWF32, but only emits output when this wavefront's
+    // dynamic id matches the filter id supplied in operand 3 (taken from
+    // the last active lane).  If no lane is active, src_val3 stays -1 and
+    // nothing is printed.
+    void
+    Call::MagicPrintWF32ID(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // Only print when the filter id matches this wavefront.
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+#endif
+    }
+
+    // Print one 64-bit value (operand 1) per lane for the whole
+    // wavefront, four lanes per output row.  Operand 2 selects hex
+    // (non-zero) vs decimal; inactive lanes print as 16 'x' characters.
+    void
+    Call::MagicPrintWF64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 4 lanes.
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            // Newline at the end of each 4-lane row, space otherwise.
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Like MagicPrintWF64, but only emits output when this wavefront's
+    // dynamic id matches the filter id supplied in operand 3 (taken from
+    // the last active lane).  If no lane is active, src_val3 stays -1 and
+    // nothing is printed.
+    void
+    Call::MagicPrintWFID64(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 4 lanes.
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // Only print when the filter id matches this wavefront.
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+#endif
+    }
+
+    // Print one float value (operand 1) per lane for the whole
+    // wavefront, eight lanes per output row; inactive lanes print as
+    // "xxxxxxxx".
+    void
+    Call::MagicPrintWFFloat(Wavefront *w)
+    {
+#if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // Start a new row, tagged with the wavefront id, every 8 lanes.
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                float src_val1 = src1.get<float>(w, lane, 1);
+                res_str += csprintf("%08f", src_val1);
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+#endif
+    }
+
+    // Simulator breakpoint: dumps this wavefront's identity and exec
+    // mask, then raises SIGTRAP so an attached GDB stops here.
+    // When done with the break, type "signal 0" in gdb to continue.
+    void
+    Call::MagicSimBreak(Wavefront *w)
+    {
+        std::string res_str;
+        // print out state for this wavefront and then break
+        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+                           w->wfSlotId);
+
+        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
+        res_str += csprintf("  Phase ID: %i\n", w->simdId);
+        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
+        res_str += csprintf("  Exec mask: ");
+
+        // Render the execution mask MSB-first, a space after every byte.
+        for (int i = VSZ - 1; i >= 0; --i) {
+            if (w->execMask(i))
+                res_str += "1";
+            else
+                res_str += "0";
+
+            if ((i & 7) == 7)
+                res_str += " ";
+        }
+
+        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+        res_str += "\nHelpful debugging hints:\n";
+        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+        fflush(stdout);
+
+        raise(SIGTRAP);
+    }
+
+    // Exclusive prefix sum across the wavefront: each active lane's dest
+    // receives the sum of operand 1 over all lower-numbered active lanes
+    // (the first active lane gets 0).  Inactive lanes are skipped and
+    // contribute nothing.
+    void
+    Call::MagicPrefixSum(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                // Write the running total before adding this lane's value,
+                // making the scan exclusive.
+                dest.set<int>(w, lane, res);
+                res += src_val1;
+            }
+        }
+    }
+
+    // Wavefront-wide sum reduction: adds operand 1 across all active
+    // lanes, then writes the total back to every active lane's dest.
+    void
+    Call::MagicReduction(Wavefront *w)
+    {
+        // reduction magic instruction
+        // The reduction instruction takes up to 64 inputs (one from
+        // each thread in a WF) and sums them. It returns the sum to
+        // each thread in the WF.
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        // Pass 1: accumulate the sum over active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                res += src_val1;
+            }
+        }
+
+        // Pass 2: broadcast the result to every active lane.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Build a ballot bitmask over the LOWER half of the wavefront
+    // (lanes 0..VSZ/2-1): bit 'lane' is set when that lane is active and
+    // its operand 1 is non-zero.  The mask is broadcast to every active
+    // lane's dest.
+    void
+    Call::MagicMaskLower(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+
+                if (src_val1) {
+                    // Only lower-half lanes contribute a bit.
+                    if (lane < (VSZ/2)) {
+                        res = res | ((uint32_t)(1) << lane);
+                    }
+                }
+            }
+        }
+
+        // Broadcast the ballot result to all active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Build a ballot bitmask over the UPPER half of the wavefront
+    // (lanes VSZ/2..VSZ-1): bit 'lane - VSZ/2' is set when that lane is
+    // active and its operand 1 is non-zero.  The mask is broadcast to
+    // every active lane's dest.
+    void
+    Call::MagicMaskUpper(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+
+                if (src_val1) {
+                    // Only upper-half lanes contribute, rebased to bit 0.
+                    if (lane >= (VSZ/2)) {
+                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+                    }
+                }
+            }
+        }
+
+        // Broadcast the ballot result to all active lanes.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+    // Join the wavefront barrier: bump each active lane's barrier count
+    // and raise the wavefront's max_bar_cnt to the largest per-lane
+    // count seen.
+    void
+    Call::MagicJoinWFBar(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]++;
+
+                if (w->bar_cnt[lane] > max_cnt) {
+                    max_cnt = w->bar_cnt[lane];
+                }
+            }
+        }
+
+        // Track the high-water mark across joins.
+        if (max_cnt > w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+    }
+
+    // Wait on the wavefront barrier: decrement each active lane's
+    // barrier count, lower max_bar_cnt to the largest remaining count,
+    // and flush the instruction buffer / pending fetch so execution
+    // restarts cleanly after the barrier.
+    void
+    Call::MagicWaitWFBar(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]--;
+            }
+
+            // NOTE(review): unlike MagicJoinWFBar, this max scan runs for
+            // every lane, including inactive ones — confirm that counting
+            // inactive lanes' bar_cnt here is intentional.
+            if (w->bar_cnt[lane] > max_cnt) {
+                max_cnt = w->bar_cnt[lane];
+            }
+        }
+
+        if (max_cnt < w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+
+        // Discard everything after the current instruction and drop any
+        // in-flight fetch so stale instructions are not executed.
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+        if (w->pendingFetch)
+            w->dropFetch = true;
+    }
+
+    // Abort the simulation with the assertion id (operand 1) of the
+    // first active lane.  panic() does not return, so at most one lane
+    // reports.  (gem5's type-aware panic formatting prints the integer
+    // lane despite the "%s" specifier.)
+    void
+    Call::MagicPanic(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
+                      src_val1, lane);
+            }
+        }
+    }
+
+    // Assemble each lane's 64-bit address from two 32-bit halves:
+    // operand 1 supplies the high word, operand 2 the low word.
+    // NOTE(review): the halves are read as signed ints, so a negative
+    // low word sign-extends when cast to Addr and its upper bits OR over
+    // the high word — assumes the low word is effectively unsigned and
+    // the compiler emits it that way; TODO confirm.
+    void
+    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+    {
+        // the address is in src1 | src2
+        for (int lane = 0; lane < VSZ; ++lane) {
+            int src_val1 = src1.get<int>(w, lane, 1);
+            int src_val2 = src1.get<int>(w, lane, 2);
+            Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
+
+            m->addr[lane] = addr;
+        }
+
+    }
+
+    // Issue a no-return atomic-add (u32) to GLOBAL memory.  The address
+    // comes from operands 1/2 (see calcAddr); the per-lane addend comes
+    // from operand 3.  The request is pushed into the global memory
+    // pipeline and the wavefront's outstanding-request counters updated.
+    void
+    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+
+        calcAddr(w, m);
+
+        // Operand 3 holds each lane's addend.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // An atomic counts as both a read and a write in flight.
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Issue a no-return atomic-add (u32) for the GROUP-memory magic op.
+    // NOTE(review): despite the name, the request is tagged SEG_GLOBAL
+    // and pushed into the global memory pipeline, and the addend is read
+    // from operand 1 — the same operand calcAddr uses as the address
+    // high word, whereas the Global variant reads operand 3.  Confirm
+    // both choices are intentional.
+    void
+    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+        calcAddr(w, m);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // An atomic counts as both a read and a write in flight.
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Issue a u32 load from GLOBAL memory at the per-lane address built
+    // by calcAddr, and push it into the global memory pipeline.  The
+    // destination register is not yet wired up (see FIXME below).
+    void
+    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr m = gpuDynInst;
+        // calculate the address
+        calcAddr(w, m);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = U32::memType;  //MemDataType::memType;
+        m->v_type = U32::vgprType; //DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        // FIXME
+        //m->dst_reg = this->dest.regIndex();
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(1));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        // A plain load only counts against the read-request bookkeeping.
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Register this wavefront in the compute unit's transactional
+    // CAS-load map under the key given by operand 1 of the first active
+    // lane, creating (and clearing) the wave queue on first use.
+    void
+    Call::MagicXactCasLd(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int src_val1 = 0;
+
+        // Take the key from the first active lane only.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                src_val1 = src1.get<int>(w, lane, 1);
+                break;
+            }
+        }
+
+        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
+            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
+            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
+        }
+
+        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
+            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+    }
+
+    // Write 1 to the dest of the highest-numbered active lane and 0 to
+    // every other active lane, by scanning lanes from high to low and
+    // clearing the flag after the first hit.
+    void
+    Call::MagicMostSigThread(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        unsigned mst = true;
+
+        for (int lane = VSZ - 1; lane >= 0; --lane) {
+            if (mask[lane]) {
+                dest.set<int>(w, lane, mst);
+                mst = false;
+            }
+        }
+    }
+
+    // Broadcast operand 1 of the highest-numbered active lane to the
+    // dest of every active lane.
+    void
+    Call::MagicMostSigBroadcast(Wavefront *w)
+    {
+        const VectorMask &mask = w->get_pred();
+        int res = 0;
+        bool got_res = false;
+
+        for (int lane = VSZ - 1; lane >= 0; --lane) {
+            if (mask[lane]) {
+                // Capture the value from the first (highest) active lane.
+                if (!got_res) {
+                    res = src1.get<int>(w, lane, 1);
+                    got_res = true;
+                }
+                dest.set<int>(w, lane, res);
+            }
+        }
+    }
+
+} // namespace HsailISA