21 files changed, 9166 insertions, 0 deletions
diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h
new file mode 100644
index 000000000..b260157ab
--- /dev/null
+++ b/src/arch/hsail/Brig.h
@@ -0,0 +1,67 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+//     HSA Team
+//
+//     Advanced Micro Devices, Inc
+//
+//     www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+//     * Redistributions of source code must retain the above copyright notice,
+//       this list of conditions and the following disclaimers.
+//
+//     * Redistributions in binary form must reproduce the above copyright notice,
+//       this list of conditions and the following disclaimers in the
+//       documentation and/or other materials provided with the distribution.
+//
+//     * Neither the names of the LLVM Team, University of Illinois at
+//       Urbana-Champaign, nor the names of its contributors may be used to
+//       endorse or promote products derived from this Software without specific
+//       prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+#ifndef INTERNAL_BRIG_H
+#define INTERNAL_BRIG_H
+
+#include <stdint.h>
+
+namespace Brig { // the #include below is expanded inside this namespace
+#include "Brig_new.hpp"
+
+// These typedefs provide some backward compatibility with earlier versions
+// of Brig.h, reducing the number of code changes. The distinct names also
+// increase legibility by showing the code's intent.
+typedef BrigBase BrigDirective; // same underlying type as BrigOperand; only the name differs
+typedef BrigBase BrigOperand;   // same underlying type as BrigDirective; only the name differs
+
+enum BrigMemoryFenceSegments { // for internal use only
+    //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
+    //.mnemo_token=_EMMemoryFenceSegments
+    //.mnemo_context=EInstModifierInstFenceContext
+    BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
+    BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
+    BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
+    BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
+};
+
+} // namespace Brig
+
+#endif // defined(INTERNAL_BRIG_H)
diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp
new file mode 100644
index 000000000..60e6f4dea
--- /dev/null
+++ b/src/arch/hsail/Brig_new.hpp
@@ -0,0 +1,1587 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+//     HSA Team
+//
+//     Advanced Micro Devices, Inc
+//
+//     www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+//     * Redistributions of source code must retain the above copyright notice,
+//       this list of conditions and the following disclaimers.
+//
+//     * Redistributions in binary form must reproduce the above copyright notice,
+//       this list of conditions and the following disclaimers in the
+//       documentation and/or other materials provided with the distribution.
+//
+//     * Neither the names of the LLVM Team, University of Illinois at
+//       Urbana-Champaign, nor the names of its contributors may be used to
+//       endorse or promote products derived from this Software without specific
+//       prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+//.ignore{
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stdint.h>
+
+enum BrigAuxDefs { // declared inside the //.ignore{ ... //} region, next to the include guard
+  MAX_OPERANDS_NUM = 6 // presumably the largest operand count of one instruction - confirm
+};
+
+//}
+
+typedef uint32_t BrigVersion32_t; // 32-bit alias; presumably holds BrigVersion numbers - confirm
+
+enum BrigVersion { // HSAIL version 1.0 and BRIG version 1.0 (major = 1, minor = 0 for both)
+
+    //.nowrap
+    //.nodump
+    //.nollvm
+
+    BRIG_VERSION_HSAIL_MAJOR = 1,
+    BRIG_VERSION_HSAIL_MINOR = 0,
+    BRIG_VERSION_BRIG_MAJOR  = 1,
+    BRIG_VERSION_BRIG_MINOR  = 0
+};
+
+typedef uint8_t BrigAlignment8_t;                           //.defValue=BRIG_ALIGNMENT_NONE
+
+typedef uint8_t BrigAllocation8_t;                          //.defValue=BRIG_ALLOCATION_NONE
+
+typedef uint8_t BrigAluModifier8_t; // naming used by every alias here: the N in _N_t is the bit width of the underlying uintN_t
+
+typedef uint8_t BrigAtomicOperation8_t;
+
+typedef uint32_t BrigCodeOffset32_t;                        //.defValue=0   //.wtype=ItemRef<Code>
+
+typedef uint8_t BrigCompareOperation8_t;
+
+typedef uint16_t BrigControlDirective16_t;
+
+typedef uint32_t BrigDataOffset32_t; // base type of the three BrigDataOffset*32_t aliases that follow
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t;      //.wtype=ListRef<Code>      //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t;   //.wtype=ListRef<Operand>   //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetString32_t;        //.wtype=StrRef             //.defValue=0
+
+typedef uint8_t BrigExecutableModifier8_t;
+
+typedef uint8_t BrigImageChannelOrder8_t;                   //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN
+
+typedef uint8_t BrigImageChannelType8_t;                    //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN
+
+typedef uint8_t BrigImageGeometry8_t;                       //.defValue=BRIG_GEOMETRY_UNKNOWN
+
+typedef uint8_t BrigImageQuery8_t;
+
+typedef uint16_t BrigKind16_t;
+
+typedef uint8_t BrigLinkage8_t;                             //.defValue=BRIG_LINKAGE_NONE
+
+typedef uint8_t BrigMachineModel8_t;                        //.defValue=BRIG_MACHINE_LARGE
+
+typedef uint8_t BrigMemoryModifier8_t;
+
+typedef uint8_t BrigMemoryOrder8_t;                         //.defValue=BRIG_MEMORY_ORDER_RELAXED
+
+typedef uint8_t BrigMemoryScope8_t;                         //.defValue=BRIG_MEMORY_SCOPE_SYSTEM
+
+typedef uint16_t BrigOpcode16_t; // 16 bits: the extension opcodes in BrigOpcode have bit 15 set
+
+typedef uint32_t BrigOperandOffset32_t;                     //.defValue=0 //.wtype=ItemRef<Operand>
+
+typedef uint8_t BrigPack8_t;                                //.defValue=BRIG_PACK_NONE
+
+typedef uint8_t BrigProfile8_t;                             //.defValue=BRIG_PROFILE_FULL
+
+typedef uint16_t BrigRegisterKind16_t;
+
+typedef uint8_t BrigRound8_t;                               //.defValue=BRIG_ROUND_NONE
+
+typedef uint8_t BrigSamplerAddressing8_t;                   //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE
+
+typedef uint8_t BrigSamplerCoordNormalization8_t;
+
+typedef uint8_t BrigSamplerFilter8_t;
+
+typedef uint8_t BrigSamplerQuery8_t;
+
+typedef uint32_t BrigSectionIndex32_t;
+
+typedef uint8_t BrigSegCvtModifier8_t;
+
+typedef uint8_t BrigSegment8_t;                             //.defValue=BRIG_SEGMENT_NONE
+
+typedef uint32_t BrigStringOffset32_t;                      //.defValue=0       //.wtype=StrRef
+
+typedef uint16_t BrigType16_t;
+
+typedef uint8_t BrigVariableModifier8_t;
+
+typedef uint8_t BrigWidth8_t;
+
+typedef uint32_t BrigExceptions32_t;
+
+enum BrigKind { // groups: 0x1000 directives, 0x2000 insts, 0x3000 operands; *_BEGIN = first kind of a group, *_END = last kind + 1
+
+    //.nollvm
+    //
+    //.wname={ s/^BRIG_KIND//; MACRO2Name($_) }
+    //.mnemo=$wname{ $wname }
+    //
+    //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" }
+    //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1"
+    //
+    //.isBodyOnly={ "false" }
+    //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()"
+    //.isBodyOnly_default="assert(false); return false"
+    //
+    //.isToplevelOnly={ "false" }
+    //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()"
+    //.isToplevelOnly_default="assert(false); return false"
+
+    BRIG_KIND_NONE = 0x0000,                        //.skip
+
+    BRIG_KIND_DIRECTIVE_BEGIN = 0x1000,             //.skip
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000,     //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001,   //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+    BRIG_KIND_DIRECTIVE_CONTROL = 0x1003,           //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004,         //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+    BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006,          //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,            //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_LABEL = 0x1009,             //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+    BRIG_KIND_DIRECTIVE_MODULE = 0x100b,            //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+    BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d,         //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+    BRIG_KIND_DIRECTIVE_END = 0x100f,               //.skip
+
+    BRIG_KIND_INST_BEGIN = 0x2000,                  //.skip
+    BRIG_KIND_INST_ADDR = 0x2000,
+    BRIG_KIND_INST_ATOMIC = 0x2001,
+    BRIG_KIND_INST_BASIC = 0x2002,
+    BRIG_KIND_INST_BR = 0x2003,
+    BRIG_KIND_INST_CMP = 0x2004,
+    BRIG_KIND_INST_CVT = 0x2005,
+    BRIG_KIND_INST_IMAGE = 0x2006,
+    BRIG_KIND_INST_LANE = 0x2007,
+    BRIG_KIND_INST_MEM = 0x2008,
+    BRIG_KIND_INST_MEM_FENCE = 0x2009,
+    BRIG_KIND_INST_MOD = 0x200a,
+    BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+    BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+    BRIG_KIND_INST_QUEUE = 0x200d,
+    BRIG_KIND_INST_SEG = 0x200e,
+    BRIG_KIND_INST_SEG_CVT = 0x200f,
+    BRIG_KIND_INST_SIGNAL = 0x2010,
+    BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+    BRIG_KIND_INST_END = 0x2012,                    //.skip
+
+    BRIG_KIND_OPERAND_BEGIN = 0x3000,               //.skip
+    BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+    BRIG_KIND_OPERAND_ALIGN = 0x3001,
+    BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+    BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+    BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+    BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip
+    BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+    BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+    BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+    BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+    BRIG_KIND_OPERAND_REGISTER = 0x300a,
+    BRIG_KIND_OPERAND_STRING = 0x300b,
+    BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+    BRIG_KIND_OPERAND_END = 0x300d                  //.skip
+};
+
+enum BrigAlignment { // sized entries encode log2(bytes) + 1 (1 byte -> 1 ... 256 bytes -> 9); LAST is implicitly 10, MAX is 9
+
+    //.mnemo={ s/^BRIG_ALIGNMENT_//; lc }
+    //.mnemo_proto="const char* align2str(unsigned arg)"
+    //
+    //.bytes={ /(\d+)/ ? $1 : undef }
+    //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1"
+    //
+    //.rbytes=$bytes{ $bytes }
+    //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)"
+    //.rbytes_default="return BRIG_ALIGNMENT_LAST"
+    //
+    //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" }
+
+    BRIG_ALIGNMENT_NONE = 0,                        //.no_mnemo
+    BRIG_ALIGNMENT_1 = 1,                           //.mnemo=""
+    BRIG_ALIGNMENT_2 = 2,
+    BRIG_ALIGNMENT_4 = 3,
+    BRIG_ALIGNMENT_8 = 4,
+    BRIG_ALIGNMENT_16 = 5,
+    BRIG_ALIGNMENT_32 = 6,
+    BRIG_ALIGNMENT_64 = 7,
+    BRIG_ALIGNMENT_128 = 8,
+    BRIG_ALIGNMENT_256 = 9,
+
+    BRIG_ALIGNMENT_LAST,                            //.skip
+    BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1    //.skip
+};
+
+enum BrigAllocation { // values 0..3; BRIG_ALLOCATION_NONE is the .defValue named on the BrigAllocation8_t typedef
+
+    //.mnemo={ s/^BRIG_ALLOCATION_//;lc }
+    //.mnemo_token=EAllocKind
+
+    BRIG_ALLOCATION_NONE = 0,       //.mnemo=""
+    BRIG_ALLOCATION_PROGRAM = 1,
+    BRIG_ALLOCATION_AGENT = 2,
+    BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+enum BrigAluModifierMask { // single flag so far; presumably OR-ed into a BrigAluModifier8_t - confirm
+    BRIG_ALU_FTZ = 1 // bit 0
+};
+
+enum BrigAtomicOperation { // 21 operations, values 0..20; 13..16 are WAIT_*, 17..20 are the matching WAITTIMEOUT_*
+
+    //.tdcaption="Atomic Operations"
+    //
+    //.mnemo={ s/^BRIG_ATOMIC_//;lc }
+    //.mnemo_token=_EMAtomicOp
+    //.mnemo_context=EInstModifierInstAtomicContext
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_ATOMIC_ADD = 0,
+    BRIG_ATOMIC_AND = 1,
+    BRIG_ATOMIC_CAS = 2,
+    BRIG_ATOMIC_EXCH = 3,
+    BRIG_ATOMIC_LD = 4,
+    BRIG_ATOMIC_MAX = 5,
+    BRIG_ATOMIC_MIN = 6,
+    BRIG_ATOMIC_OR = 7,
+    BRIG_ATOMIC_ST = 8,
+    BRIG_ATOMIC_SUB = 9,
+    BRIG_ATOMIC_WRAPDEC = 10,
+    BRIG_ATOMIC_WRAPINC = 11,
+    BRIG_ATOMIC_XOR = 12,
+    BRIG_ATOMIC_WAIT_EQ = 13,
+    BRIG_ATOMIC_WAIT_NE = 14,
+    BRIG_ATOMIC_WAIT_LT = 15,
+    BRIG_ATOMIC_WAIT_GTE = 16,
+    BRIG_ATOMIC_WAITTIMEOUT_EQ = 17,
+    BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+    BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+    BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+enum BrigCompareOperation { // values 0..27; S-prefixed entries 14..19 mirror 0..5, but from 20 on their order differs from 6..13
+
+    //.tdcaption="Comparison Operators"
+    //
+    //.mnemo={ s/^BRIG_COMPARE_//;lc }
+    //.mnemo_token=_EMCompare
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_COMPARE_EQ = 0,
+    BRIG_COMPARE_NE = 1,
+    BRIG_COMPARE_LT = 2,
+    BRIG_COMPARE_LE = 3,
+    BRIG_COMPARE_GT = 4,
+    BRIG_COMPARE_GE = 5,
+    BRIG_COMPARE_EQU = 6,
+    BRIG_COMPARE_NEU = 7,
+    BRIG_COMPARE_LTU = 8,
+    BRIG_COMPARE_LEU = 9,
+    BRIG_COMPARE_GTU = 10,
+    BRIG_COMPARE_GEU = 11,
+    BRIG_COMPARE_NUM = 12,
+    BRIG_COMPARE_NAN = 13,
+    BRIG_COMPARE_SEQ = 14,
+    BRIG_COMPARE_SNE = 15,
+    BRIG_COMPARE_SLT = 16,
+    BRIG_COMPARE_SLE = 17,
+    BRIG_COMPARE_SGT = 18,
+    BRIG_COMPARE_SGE = 19,
+    BRIG_COMPARE_SGEU = 20, // NOTE(review): SGEU comes first and SGTU last in this run; values are encoded constants, do not reorder
+    BRIG_COMPARE_SEQU = 21,
+    BRIG_COMPARE_SNEU = 22,
+    BRIG_COMPARE_SLTU = 23,
+    BRIG_COMPARE_SLEU = 24,
+    BRIG_COMPARE_SNUM = 25,
+    BRIG_COMPARE_SNAN = 26,
+    BRIG_COMPARE_SGTU = 27
+};
+
+enum BrigControlDirective { // values 0..9; BRIG_CONTROL_NONE (0) carries the //.skip annotation
+
+    //.mnemo={ s/^BRIG_CONTROL_//;lc }
+    //.mnemo_token=EControl
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CONTROL_NONE = 0, //.skip
+    BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+    BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+    BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+    BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+    BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+    BRIG_CONTROL_REQUIREDDIM = 6,
+    BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+    BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+    BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+enum BrigExecutableModifierMask { // single flag so far; presumably OR-ed into a BrigExecutableModifier8_t - confirm
+    //.nodump
+    BRIG_EXECUTABLE_DEFINITION = 1 // bit 0
+};
+
+enum BrigImageChannelOrder { // named orders 0..19; UNKNOWN is implicitly 20; FIRST_USER_DEFINED is 128
+
+    //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc }
+    //.mnemo_token=EImageOrder
+    //.mnemo_context=EImageOrderContext
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CHANNEL_ORDER_A = 0,
+    BRIG_CHANNEL_ORDER_R = 1,
+    BRIG_CHANNEL_ORDER_RX = 2,
+    BRIG_CHANNEL_ORDER_RG = 3,
+    BRIG_CHANNEL_ORDER_RGX = 4,
+    BRIG_CHANNEL_ORDER_RA = 5,
+    BRIG_CHANNEL_ORDER_RGB = 6,
+    BRIG_CHANNEL_ORDER_RGBX = 7,
+    BRIG_CHANNEL_ORDER_RGBA = 8,
+    BRIG_CHANNEL_ORDER_BGRA = 9,
+    BRIG_CHANNEL_ORDER_ARGB = 10,
+    BRIG_CHANNEL_ORDER_ABGR = 11,
+    BRIG_CHANNEL_ORDER_SRGB = 12,
+    BRIG_CHANNEL_ORDER_SRGBX = 13,
+    BRIG_CHANNEL_ORDER_SRGBA = 14,
+    BRIG_CHANNEL_ORDER_SBGRA = 15,
+    BRIG_CHANNEL_ORDER_INTENSITY = 16,
+    BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+    BRIG_CHANNEL_ORDER_DEPTH = 18,
+    BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+    // used internally; no explicit value, so it follows DEPTH_STENCIL as 20
+    BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified
+
+    BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip
+
+};
+
+enum BrigImageChannelType { // named types 0..15; UNKNOWN is implicitly 16; FIRST_USER_DEFINED is 128
+
+    //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc }
+    //.mnemo_token=EImageFormat
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+    BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+    BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+    BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+    BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+    BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+    BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+    // used internally; no explicit value, so it follows FLOAT as 16
+    BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo=""
+
+    BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigImageGeometry { // named geometries 0..7; UNKNOWN is implicitly 8; FIRST_USER_DEFINED is 128
+
+    //.tdcaption="Geometry"
+    //
+    //.mnemo={ s/^BRIG_GEOMETRY_//;lc }
+    //.mnemo_token=EImageGeometry
+    //
+    //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef}
+    //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo"
+    //.dim_default="assert(0); return 0"
+    //
+    //.depth={/DEPTH$/?"true":"false"}
+    //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo"
+    //.depth_default="return false"
+
+    BRIG_GEOMETRY_1D = 0,
+    BRIG_GEOMETRY_2D = 1,
+    BRIG_GEOMETRY_3D = 2,
+    BRIG_GEOMETRY_1DA = 3,
+    BRIG_GEOMETRY_2DA = 4,
+    BRIG_GEOMETRY_1DB = 5,
+    BRIG_GEOMETRY_2DDEPTH = 6,
+    BRIG_GEOMETRY_2DADEPTH = 7,
+
+    // used internally; no explicit value, so it follows 2DADEPTH as 8
+    BRIG_GEOMETRY_UNKNOWN, //.mnemo=""
+
+    BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigImageQuery { // values 0..6; presumably selects the property returned by the queryimage opcode - confirm
+
+    //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc }
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_IMAGE_QUERY_WIDTH = 0,
+    BRIG_IMAGE_QUERY_HEIGHT = 1,
+    BRIG_IMAGE_QUERY_DEPTH = 2,
+    BRIG_IMAGE_QUERY_ARRAY = 3,
+    BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+    BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+    BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6
+};
+
+enum BrigLinkage { // values 0..4; BRIG_LINKAGE_NONE is the .defValue named on the BrigLinkage8_t typedef
+
+    //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc }
+
+    BRIG_LINKAGE_NONE = 0,
+    BRIG_LINKAGE_PROGRAM = 1,
+    BRIG_LINKAGE_MODULE = 2,
+    BRIG_LINKAGE_FUNCTION = 3,
+    BRIG_LINKAGE_ARG = 4
+};
+
+enum BrigMachineModel { // SMALL = 0, LARGE = 1 (the .defValue on BrigMachineModel8_t); UNDEF = 2 carries //.skip
+
+    //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc }
+    //.mnemo_token=ETargetMachine
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_MACHINE_SMALL = 0,
+    BRIG_MACHINE_LARGE = 1,
+
+    BRIG_MACHINE_UNDEF = 2 //.skip
+};
+
+enum BrigMemoryModifierMask { //.tddef=0
+    BRIG_MEMORY_CONST = 1 // bit 0; the only flag defined; presumably OR-ed into a BrigMemoryModifier8_t - confirm
+};
+
+enum BrigMemoryOrder { // orders 0..4; LAST (5) is one past the highest; RELAXED is the .defValue on BrigMemoryOrder8_t
+
+    //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc }
+    //.mnemo_token=_EMMemoryOrder
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_MEMORY_ORDER_NONE = 0,                 //.mnemo=""
+    BRIG_MEMORY_ORDER_RELAXED = 1,              //.mnemo=rlx
+    BRIG_MEMORY_ORDER_SC_ACQUIRE = 2,           //.mnemo=scacq
+    BRIG_MEMORY_ORDER_SC_RELEASE = 3,           //.mnemo=screl
+    BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4,   //.mnemo=scar
+
+    BRIG_MEMORY_ORDER_LAST = 5 //.skip
+};
+
+enum BrigMemoryScope { // scopes 0..5; LAST (6) is one past the highest; SYSTEM is the .defValue on BrigMemoryScope8_t
+
+    //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc }
+    //.mnemo_token=_EMMemoryScope
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_MEMORY_SCOPE_NONE = 0,         //.mnemo=""
+    BRIG_MEMORY_SCOPE_WORKITEM = 1,     //.mnemo=""
+    BRIG_MEMORY_SCOPE_WAVEFRONT = 2,    //.mnemo=wave
+    BRIG_MEMORY_SCOPE_WORKGROUP = 3,    //.mnemo=wg
+    BRIG_MEMORY_SCOPE_AGENT = 4,        //.mnemo=agent
+    BRIG_MEMORY_SCOPE_SYSTEM = 5,       //.mnemo=system
+
+    BRIG_MEMORY_SCOPE_LAST = 6 //.skip
+};
+
+enum BrigOpcode { // core opcodes 0..136; GCN*/AMD* opcodes are (1u << 15) | n, n = 0..35, i.e. they start at BRIG_OPCODE_FIRST_USER_DEFINED
+
+    //.tdcaption="Instruction Opcodes"
+    //
+    //.k={ "BASIC" }
+    //.pscode=$k{ MACRO2Name("_".$k) }
+    //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" }
+    //.opcodeparser_incfile=ParserUtilities
+    //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic"
+    //
+    //.psopnd={undef}
+    //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" }
+    //.opndparser_incfile=ParserUtilities
+    //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands"
+    //
+    //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc }
+    //.mnemo_scanner=Instructions //.mnemo_token=EInstruction
+    //.mnemo_context=EDefaultContext
+    //
+    //.has_memory_order={undef}
+    //.semsupport=$has_memory_order{ return $has_memory_order && "true" }
+    //
+    //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; }
+    //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true"
+    //
+    //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? "Basic":$pscode) ."(inst)").")" }
+    //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)"
+    //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()"
+    //.opcodevis_incfile=ItemUtils
+    //
+    //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? "true" : undef }
+    //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false"
+    //
+    //.vecOpndIndex={undef}
+    //.vecOpndIndex_switch  //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1"
+    //.vecOpndIndex_incfile=ParserUtilities
+    //
+    //.numdst={undef}
+    //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1"
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_OPCODE_NOP = 0,                    //.k=NOP            //.hasType=false
+    BRIG_OPCODE_ABS = 1,                    //.k=BASIC_OR_MOD
+    BRIG_OPCODE_ADD = 2,                    //.k=BASIC_OR_MOD
+    BRIG_OPCODE_BORROW = 3,
+    BRIG_OPCODE_CARRY = 4,
+    BRIG_OPCODE_CEIL = 5,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_COPYSIGN = 6,               //.k=BASIC_OR_MOD
+    BRIG_OPCODE_DIV = 7,                    //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FLOOR = 8,                  //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FMA = 9,                    //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FRACT = 10,                 //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAD = 11,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAX = 12,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MIN = 13,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MUL = 14,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MULHI = 15,                 //.k=BASIC_OR_MOD
+    BRIG_OPCODE_NEG = 16,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_REM = 17,
+    BRIG_OPCODE_RINT = 18,                  //.k=BASIC_OR_MOD
+    BRIG_OPCODE_SQRT = 19,                  //.k=BASIC_OR_MOD
+    BRIG_OPCODE_SUB = 20,                   //.k=BASIC_OR_MOD
+    BRIG_OPCODE_TRUNC = 21,                 //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAD24 = 22,
+    BRIG_OPCODE_MAD24HI = 23,
+    BRIG_OPCODE_MUL24 = 24,
+    BRIG_OPCODE_MUL24HI = 25,
+    BRIG_OPCODE_SHL = 26,
+    BRIG_OPCODE_SHR = 27,
+    BRIG_OPCODE_AND = 28,
+    BRIG_OPCODE_NOT = 29,
+    BRIG_OPCODE_OR = 30,
+    BRIG_OPCODE_POPCOUNT = 31,              //.k=SOURCE_TYPE
+    BRIG_OPCODE_XOR = 32,
+    BRIG_OPCODE_BITEXTRACT = 33,
+    BRIG_OPCODE_BITINSERT = 34,
+    BRIG_OPCODE_BITMASK = 35,
+    BRIG_OPCODE_BITREV = 36,
+    BRIG_OPCODE_BITSELECT = 37,
+    BRIG_OPCODE_FIRSTBIT = 38,              //.k=SOURCE_TYPE
+    BRIG_OPCODE_LASTBIT = 39,               //.k=SOURCE_TYPE
+    BRIG_OPCODE_COMBINE = 40,               //.k=SOURCE_TYPE    //.vecOpndIndex=1
+    BRIG_OPCODE_EXPAND = 41,                //.k=SOURCE_TYPE    //.vecOpndIndex=0
+    BRIG_OPCODE_LDA = 42,                   //.k=ADDR
+    BRIG_OPCODE_MOV = 43,
+    BRIG_OPCODE_SHUFFLE = 44,
+    BRIG_OPCODE_UNPACKHI = 45,
+    BRIG_OPCODE_UNPACKLO = 46,
+    BRIG_OPCODE_PACK = 47,                  //.k=SOURCE_TYPE
+    BRIG_OPCODE_UNPACK = 48,                //.k=SOURCE_TYPE
+    BRIG_OPCODE_CMOV = 49,
+    BRIG_OPCODE_CLASS = 50,                 //.k=SOURCE_TYPE
+    BRIG_OPCODE_NCOS = 51,
+    BRIG_OPCODE_NEXP2 = 52,
+    BRIG_OPCODE_NFMA = 53,
+    BRIG_OPCODE_NLOG2 = 54,
+    BRIG_OPCODE_NRCP = 55,
+    BRIG_OPCODE_NRSQRT = 56,
+    BRIG_OPCODE_NSIN = 57,
+    BRIG_OPCODE_NSQRT = 58,
+    BRIG_OPCODE_BITALIGN = 59,
+    BRIG_OPCODE_BYTEALIGN = 60,
+    BRIG_OPCODE_PACKCVT = 61,               //.k=SOURCE_TYPE
+    BRIG_OPCODE_UNPACKCVT = 62,             //.k=SOURCE_TYPE
+    BRIG_OPCODE_LERP = 63,
+    BRIG_OPCODE_SAD = 64,                   //.k=SOURCE_TYPE
+    BRIG_OPCODE_SADHI = 65,                 //.k=SOURCE_TYPE
+    BRIG_OPCODE_SEGMENTP = 66,              //.k=SEG_CVT
+    BRIG_OPCODE_FTOS = 67,                  //.k=SEG_CVT
+    BRIG_OPCODE_STOF = 68,                  //.k=SEG_CVT
+    BRIG_OPCODE_CMP = 69,                   //.k=CMP
+    BRIG_OPCODE_CVT = 70,                   //.k=CVT
+    BRIG_OPCODE_LD = 71,                    //.k=MEM            //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_ST = 72,                    //.k=MEM            //.has_memory_order //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_ATOMIC = 73,                //.k=ATOMIC
+    BRIG_OPCODE_ATOMICNORET = 74,           //.k=ATOMIC         //.numdst=0
+    BRIG_OPCODE_SIGNAL = 75,                //.k=SIGNAL
+    BRIG_OPCODE_SIGNALNORET = 76,           //.k=SIGNAL         //.numdst=0
+    BRIG_OPCODE_MEMFENCE = 77,              //.k=MEM_FENCE      //.numdst=0
+    BRIG_OPCODE_RDIMAGE = 78,               //.k=IMAGE          //.vecOpndIndex=0
+    BRIG_OPCODE_LDIMAGE = 79,               //.k=IMAGE          //.vecOpndIndex=0
+    BRIG_OPCODE_STIMAGE = 80,               //.k=IMAGE          //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_IMAGEFENCE = 81,            //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_QUERYIMAGE = 82,            //.k=QUERY_IMAGE
+    BRIG_OPCODE_QUERYSAMPLER = 83,          //.k=QUERY_SAMPLER
+    BRIG_OPCODE_CBR = 84,                   //.k=BR             //.numdst=0
+    BRIG_OPCODE_BR = 85,                    //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_SBR = 86,                   //.k=BR             //.numdst=0     //.psopnd=SbrOperands
+    BRIG_OPCODE_BARRIER = 87,               //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_WAVEBARRIER = 88,           //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_ARRIVEFBAR = 89,            //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_INITFBAR = 90,              //.k=BASIC_NO_TYPE  //.numdst=0     //.hasType=false
+    BRIG_OPCODE_JOINFBAR = 91,              //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_LEAVEFBAR = 92,             //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_RELEASEFBAR = 93,           //.k=BASIC_NO_TYPE  //.numdst=0
+    BRIG_OPCODE_WAITFBAR = 94,              //.k=BR             //.numdst=0     //.hasType=false
+    BRIG_OPCODE_LDF = 95,
+    BRIG_OPCODE_ACTIVELANECOUNT = 96,       //.k=LANE
+    BRIG_OPCODE_ACTIVELANEID = 97,          //.k=LANE
+    BRIG_OPCODE_ACTIVELANEMASK = 98,        //.k=LANE           //.vecOpndIndex=0
+    BRIG_OPCODE_ACTIVELANEPERMUTE = 99,     //.k=LANE
+    BRIG_OPCODE_CALL = 100,                 //.k=BR             //.psopnd=CallOperands //.numdst=0 //.hasType=false
+    BRIG_OPCODE_SCALL = 101,                //.k=BR             //.psopnd=CallOperands //.numdst=0
+    BRIG_OPCODE_ICALL = 102,                //.k=BR             //.psopnd=CallOperands //.numdst=0
+    BRIG_OPCODE_RET = 103,                  //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_ALLOCA = 104,               //.k=MEM
+    BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+    BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+    BRIG_OPCODE_DIM = 107,
+    BRIG_OPCODE_GRIDGROUPS = 108,
+    BRIG_OPCODE_GRIDSIZE = 109,
+    BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+    BRIG_OPCODE_PACKETID = 111,
+    BRIG_OPCODE_WORKGROUPID = 112,
+    BRIG_OPCODE_WORKGROUPSIZE = 113,
+    BRIG_OPCODE_WORKITEMABSID = 114,
+    BRIG_OPCODE_WORKITEMFLATABSID = 115,
+    BRIG_OPCODE_WORKITEMFLATID = 116,
+    BRIG_OPCODE_WORKITEMID = 117,
+    BRIG_OPCODE_CLEARDETECTEXCEPT = 118,    //.numdst=0
+    BRIG_OPCODE_GETDETECTEXCEPT = 119,
+    BRIG_OPCODE_SETDETECTEXCEPT = 120,      //.numdst=0
+    BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121,   //.k=QUEUE
+    BRIG_OPCODE_CASQUEUEWRITEINDEX = 122,   //.k=QUEUE
+    BRIG_OPCODE_LDQUEUEREADINDEX = 123,     //.k=QUEUE
+    BRIG_OPCODE_LDQUEUEWRITEINDEX = 124,    //.k=QUEUE
+    BRIG_OPCODE_STQUEUEREADINDEX = 125,     //.k=QUEUE      //.numdst=0
+    BRIG_OPCODE_STQUEUEWRITEINDEX = 126,    //.k=QUEUE      //.numdst=0
+    BRIG_OPCODE_CLOCK = 127,
+    BRIG_OPCODE_CUID = 128,
+    BRIG_OPCODE_DEBUGTRAP = 129,            //.numdst=0
+    BRIG_OPCODE_GROUPBASEPTR = 130,
+    BRIG_OPCODE_KERNARGBASEPTR = 131,
+    BRIG_OPCODE_LANEID = 132,
+    BRIG_OPCODE_MAXCUID = 133,
+    BRIG_OPCODE_MAXWAVEID = 134,
+    BRIG_OPCODE_NULLPTR = 135,              //.k=SEG
+    BRIG_OPCODE_WAVEID = 136,
+    BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip
+
+    BRIG_OPCODE_GCNMADU = (1u << 15) | 0,           //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNMADS = (1u << 15) | 1,           //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2,
+    BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3,
+    BRIG_OPCODE_GCNMED3 = (1u << 15) | 4,
+    BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5,         //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6,      //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7,     //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8,     //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNBFM = (1u << 15) | 9,
+    BRIG_OPCODE_GCNLD = (1u << 15) | 10,            //.k=MEM            //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_GCNST = (1u << 15) | 11,            //.k=MEM            //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12,        //.k=ATOMIC
+    BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13,   //.k=ATOMIC         //.mnemo=gcn_atomicNoRet
+    BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14,
+    BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15,
+    BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16,   //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc
+    BRIG_OPCODE_GCNMSAD = (1u << 15) | 17,
+    BRIG_OPCODE_GCNQSAD = (1u << 15) | 18,
+    BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19,
+    BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20,        //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNSADW = (1u << 15) | 21,
+    BRIG_OPCODE_GCNSADD = (1u << 15) | 22,
+    BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23,       //.k=ADDR           //.mnemo=gcn_atomic_consume
+    BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24,        //.k=ADDR           //.mnemo=gcn_atomic_append
+    BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25,        //.mnemo=gcn_b4xchg
+    BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26,       //.mnemo=gcn_b32xchg
+    BRIG_OPCODE_GCNMAX = (1u << 15) | 27,
+    BRIG_OPCODE_GCNMIN = (1u << 15) | 28,
+    BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29,    //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30,
+
+    BRIG_OPCODE_AMDRDIMAGELOD  = (1u << 15) | 31,    //.k=IMAGE //.mnemo=amd_rdimagelod  //.vecOpndIndex=0
+    BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32,    //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0
+    BRIG_OPCODE_AMDLDIMAGEMIP  = (1u << 15) | 33,    //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0
+    BRIG_OPCODE_AMDSTIMAGEMIP  = (1u << 15) | 34,    //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_AMDQUERYIMAGE  = (1u << 15) | 35     //.k=QUERY_IMAGE //.mnemo=amd_queryimage
+};
+
+enum BrigPack { // Packing-control codes, numbered 0..12.
+
+    //.tdcaption="Packing"
+    //
+    //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc }
+    //.mnemo_token=_EMPacking
+    //
+    //.print=$mnemo{ "_$mnemo" }
+    // NONE has an empty mnemonic; codes 7..12 are the SAT forms of codes 1..6, in the same order.
+    BRIG_PACK_NONE = 0, //.mnemo=""
+    BRIG_PACK_PP = 1,
+    BRIG_PACK_PS = 2,
+    BRIG_PACK_SP = 3,
+    BRIG_PACK_SS = 4,
+    BRIG_PACK_S = 5,
+    BRIG_PACK_P = 6,
+    BRIG_PACK_PPSAT = 7,
+    BRIG_PACK_PSSAT = 8,
+    BRIG_PACK_SPSAT = 9,
+    BRIG_PACK_SSSAT = 10,
+    BRIG_PACK_SSAT = 11,
+    BRIG_PACK_PSAT = 12
+};
+
+enum BrigProfile { // Profile codes: BASE = 0, FULL = 1.
+
+    //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc }
+    //.mnemo_token=ETargetProfile
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_PROFILE_BASE = 0,
+    BRIG_PROFILE_FULL = 1,
+    // UNDEF is tagged skip in its annotation.
+    BRIG_PROFILE_UNDEF = 2 //.skip
+};
+
+enum BrigRegisterKind { // Register kinds 0..3; each entry carries a bits tag (1, 32, 64, 128).
+
+    //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) }
+    //
+    //.bits={ }
+    //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1"
+    //
+    //.nollvm
+    // The bits tags feed the getRegBits() prototype named above; its default is (unsigned)-1.
+    BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1
+    BRIG_REGISTER_KIND_SINGLE = 1,  //.bits=32
+    BRIG_REGISTER_KIND_DOUBLE = 2,  //.bits=64
+    BRIG_REGISTER_KIND_QUAD = 3     //.bits=128
+};
+
+enum BrigRound { // Rounding-mode codes 0..21; the name parts FLOAT, INTEGER, SIGNALING and SAT drive the is*Rounding() predicates declared below.
+
+    //.mnemo={}
+    //.mnemo_fn=round2str //.mnemo_token=_EMRound
+    //
+    //.sat={/_SAT$/? "true" : "false"}
+    //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding"
+    //.sat_default="return false"
+    //
+    //.sig={/_SIGNALING_/? "true" : "false"}
+    //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding"
+    //.sig_default="return false"
+    //
+    //.int={/_INTEGER_/? "true" : "false"}
+    //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding"
+    //.int_default="return false"
+    //
+    //.flt={/_FLOAT_/? "true" : "false"}
+    //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding"
+    //.flt_default="return false"
+    //
+    //.print=$mnemo{ "_$mnemo" }
+    // Codes 1..5 are FLOAT modes, 6..21 are INTEGER modes; 14..21 are SIGNALING; 10..13 and 18..21 end in SAT.
+    BRIG_ROUND_NONE = 0,                                    //.no_mnemo
+    BRIG_ROUND_FLOAT_DEFAULT = 1,                           //.no_mnemo
+    BRIG_ROUND_FLOAT_NEAR_EVEN = 2,                         //.mnemo=near
+    BRIG_ROUND_FLOAT_ZERO = 3,                              //.mnemo=zero
+    BRIG_ROUND_FLOAT_PLUS_INFINITY = 4,                     //.mnemo=up
+    BRIG_ROUND_FLOAT_MINUS_INFINITY = 5,                    //.mnemo=down
+    BRIG_ROUND_INTEGER_NEAR_EVEN = 6,                       //.mnemo=neari
+    BRIG_ROUND_INTEGER_ZERO = 7,                            //.mnemo=zeroi
+    BRIG_ROUND_INTEGER_PLUS_INFINITY = 8,                   //.mnemo=upi
+    BRIG_ROUND_INTEGER_MINUS_INFINITY = 9,                  //.mnemo=downi
+    BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10,                  //.mnemo=neari_sat
+    BRIG_ROUND_INTEGER_ZERO_SAT = 11,                       //.mnemo=zeroi_sat
+    BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12,              //.mnemo=upi_sat
+    BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13,             //.mnemo=downi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14,            //.mnemo=sneari
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15,                 //.mnemo=szeroi
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16,        //.mnemo=supi
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17,       //.mnemo=sdowni
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18,        //.mnemo=sneari_sat
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19,             //.mnemo=szeroi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20,    //.mnemo=supi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21    //.mnemo=sdowni_sat
+};
+
+enum BrigSamplerAddressing { // Sampler addressing-mode codes 0..4.
+
+    //.mnemo={ s/^BRIG_ADDRESSING_//;lc }
+    //.mnemo_token=ESamplerAddressingMode
+
+    BRIG_ADDRESSING_UNDEFINED = 0,
+    BRIG_ADDRESSING_CLAMP_TO_EDGE = 1,
+    BRIG_ADDRESSING_CLAMP_TO_BORDER = 2,
+    BRIG_ADDRESSING_REPEAT = 3,
+    BRIG_ADDRESSING_MIRRORED_REPEAT = 4,
+    // FIRST_USER_DEFINED (128) is tagged skip; values 5..127 are not named here.
+    BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigSamplerCoordNormalization { // Sampler coordinate mode: UNNORMALIZED = 0, NORMALIZED = 1.
+
+    //.mnemo={ s/^BRIG_COORD_//;lc }
+    //.mnemo_token=ESamplerCoord
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_COORD_UNNORMALIZED = 0,
+    BRIG_COORD_NORMALIZED = 1
+};
+
+enum BrigSamplerFilter { // Sampler filter codes: NEAREST = 0, LINEAR = 1.
+
+    //.mnemo={ s/^BRIG_FILTER_//;lc }
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_FILTER_NEAREST = 0,
+    BRIG_FILTER_LINEAR = 1,
+    // FIRST_USER_DEFINED (128) is tagged skip; values 2..127 are not named here.
+    BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigSamplerQuery { // Sampler query selectors: ADDRESSING = 0, COORD = 1, FILTER = 2.
+
+    //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc }
+    //.mnemo_token=_EMSamplerQuery
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_SAMPLER_QUERY_ADDRESSING = 0,
+    BRIG_SAMPLER_QUERY_COORD = 1,
+    BRIG_SAMPLER_QUERY_FILTER = 2
+};
+
+enum BrigSectionIndex { // Section indices: DATA = 0, CODE = 1, OPERAND = 2; 3 is BEGIN_IMPLEMENTATION_DEFINED.
+
+    //.nollvm
+    //
+    //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc }
+
+    BRIG_SECTION_INDEX_DATA = 0,
+    BRIG_SECTION_INDEX_CODE = 1,
+    BRIG_SECTION_INDEX_OPERAND = 2,
+    BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3,
+
+    // used internally: a second name for the same value (3), tagged skip
+    BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip
+};
+
+enum BrigSegCvtModifierMask { // Single flag mask: NONULL = 1 (bit 0).
+    BRIG_SEG_CVT_NONULL = 1         //.mnemo="nonull" //.print="_nonull"
+};
+
+enum BrigSegment { // Segment codes 0..9, plus FIRST_USER_DEFINED = 128.
+
+    //.mnemo={ s/^BRIG_SEGMENT_//;lc}
+    //.mnemo_token=_EMSegment
+    //.mnemo_context=EInstModifierContext
+    //
+    //.print=$mnemo{ $mnemo ? "_$mnemo" : "" }
+    // NONE and FLAT both have an empty mnemonic, so the print rule emits nothing for them.
+    BRIG_SEGMENT_NONE = 0, //.mnemo=""
+    BRIG_SEGMENT_FLAT = 1, //.mnemo=""
+    BRIG_SEGMENT_GLOBAL = 2,
+    BRIG_SEGMENT_READONLY = 3,
+    BRIG_SEGMENT_KERNARG = 4,
+    BRIG_SEGMENT_GROUP = 5,
+    BRIG_SEGMENT_PRIVATE = 6,
+    BRIG_SEGMENT_SPILL = 7,
+    BRIG_SEGMENT_ARG = 8,
+
+    BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip
+    // AMD_GCN is declared after the 128 marker but is explicitly 9, so it continues the 0..8 run above.
+    BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region"
+};
+
+enum BrigPackedTypeBits { // Bit-field layout of a type code: base in bits 0-4, pack in bits 5-6, array flag in bit 7.
+
+    //.nodump
+    //
+    //.nollvm
+
+    BRIG_TYPE_BASE_SIZE  = 5,
+    BRIG_TYPE_PACK_SIZE  = 2,
+    BRIG_TYPE_ARRAY_SIZE = 1,
+    // Each field starts where the previous one ends: shifts are 0, 5 and 7.
+    BRIG_TYPE_BASE_SHIFT  = 0,
+    BRIG_TYPE_PACK_SHIFT  = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE,
+    BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE,
+    // Resulting masks: BASE = 0x1F, PACK = 0x60, ARRAY = 0x80.
+    BRIG_TYPE_BASE_MASK  = ((1 << BRIG_TYPE_BASE_SIZE)  - 1) << BRIG_TYPE_BASE_SHIFT,
+    BRIG_TYPE_PACK_MASK  = ((1 << BRIG_TYPE_PACK_SIZE)  - 1) << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT,
+    // Pack-field values already shifted into place: 0x00, 0x20, 0x40, 0x60.
+    BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_32   = 1 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_64   = 2 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_128  = 3 << BRIG_TYPE_PACK_SHIFT,
+    // Array flag already shifted into place: 0x80.
+    BRIG_TYPE_ARRAY     = 1 << BRIG_TYPE_ARRAY_SHIFT
+};
+
+enum BrigType { // Type codes: base types 0..23, packed types (base | BRIG_TYPE_PACK_*), and array types (element | BRIG_TYPE_ARRAY).
+
+    //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef }
+    //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0"
+    //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef }
+    //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0"
+    //
+    //.mnemo={ s/^BRIG_TYPE_//;lc }
+    //.mnemo_token=_EMType
+    //
+    //.array={/ARRAY$/?"true":"false"}
+    //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type"
+    //.array_default="return false"
+    //
+    //.a2e={/(.*)_ARRAY$/? $1 : "BRIG_TYPE_NONE"}
+    //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type"
+    //.a2e_default="return BRIG_TYPE_NONE"
+    //
+    //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"}
+    //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type"
+    //.e2a_default="return BRIG_TYPE_NONE"
+    //
+    //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc}
+    //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type"
+    //.t2s_default="return NULL"
+    //
+    //.dispatch_switch //.dispatch_incfile=TemplateUtilities
+    //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)"
+    //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? "v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" }
+    //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)"
+    //
+    //- .tdname=BrigType
+    //
+    //.print=$mnemo{ "_$mnemo" }
+    // Base types. B1 overrides numBytes to 1; SAMP, the image types and SIG32/SIG64 override numBits to 64.
+    BRIG_TYPE_NONE  = 0,  //.mnemo=""       //.print=""
+    BRIG_TYPE_U8    = 1,  //.ctype=uint8_t
+    BRIG_TYPE_U16   = 2,  //.ctype=uint16_t
+    BRIG_TYPE_U32   = 3,  //.ctype=uint32_t
+    BRIG_TYPE_U64   = 4,  //.ctype=uint64_t
+    BRIG_TYPE_S8    = 5,  //.ctype=int8_t
+    BRIG_TYPE_S16   = 6,  //.ctype=int16_t
+    BRIG_TYPE_S32   = 7,  //.ctype=int32_t
+    BRIG_TYPE_S64   = 8,  //.ctype=int64_t
+    BRIG_TYPE_F16   = 9,  //.ctype=f16_t
+    BRIG_TYPE_F32   = 10, //.ctype=float
+    BRIG_TYPE_F64   = 11, //.ctype=double
+    BRIG_TYPE_B1    = 12, //.ctype=bool     //.numBytes=1
+    BRIG_TYPE_B8    = 13, //.ctype=uint8_t
+    BRIG_TYPE_B16   = 14, //.ctype=uint16_t
+    BRIG_TYPE_B32   = 15, //.ctype=uint32_t
+    BRIG_TYPE_B64   = 16, //.ctype=uint64_t
+    BRIG_TYPE_B128  = 17, //.ctype=b128_t
+    BRIG_TYPE_SAMP  = 18, //.mnemo=samp     //.numBits=64
+    BRIG_TYPE_ROIMG = 19, //.mnemo=roimg    //.numBits=64
+    BRIG_TYPE_WOIMG = 20, //.mnemo=woimg    //.numBits=64
+    BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg    //.numBits=64
+    BRIG_TYPE_SIG32 = 22, //.mnemo=sig32    //.numBits=64
+    BRIG_TYPE_SIG64 = 23, //.mnemo=sig64    //.numBits=64
+    // Packed types: element base type OR-ed with the pack code matching the total width (element bits x lane count = 32, 64 or 128).
+    BRIG_TYPE_U8X4  = BRIG_TYPE_U8  | BRIG_TYPE_PACK_32,  //.ctype=uint8_t
+    BRIG_TYPE_U8X8  = BRIG_TYPE_U8  | BRIG_TYPE_PACK_64,  //.ctype=uint8_t
+    BRIG_TYPE_U8X16 = BRIG_TYPE_U8  | BRIG_TYPE_PACK_128, //.ctype=uint8_t
+    BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32,  //.ctype=uint16_t
+    BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64,  //.ctype=uint16_t
+    BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t
+    BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64,  //.ctype=uint32_t
+    BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t
+    BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t
+    BRIG_TYPE_S8X4  = BRIG_TYPE_S8  | BRIG_TYPE_PACK_32,  //.ctype=int8_t
+    BRIG_TYPE_S8X8  = BRIG_TYPE_S8  | BRIG_TYPE_PACK_64,  //.ctype=int8_t
+    BRIG_TYPE_S8X16 = BRIG_TYPE_S8  | BRIG_TYPE_PACK_128, //.ctype=int8_t
+    BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32,  //.ctype=int16_t
+    BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64,  //.ctype=int16_t
+    BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t
+    BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64,  //.ctype=int32_t
+    BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t
+    BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t
+    BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32,  //.ctype=f16_t
+    BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64,  //.ctype=f16_t
+    BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t
+    BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64,  //.ctype=float
+    BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float
+    BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double
+    // Array types: element type OR-ed with BRIG_TYPE_ARRAY. NONE and B1 have no array form; all array entries have empty mnemo/print.
+    BRIG_TYPE_U8_ARRAY    = BRIG_TYPE_U8    | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U16_ARRAY   = BRIG_TYPE_U16   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U32_ARRAY   = BRIG_TYPE_U32   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U64_ARRAY   = BRIG_TYPE_U64   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S8_ARRAY    = BRIG_TYPE_S8    | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S16_ARRAY   = BRIG_TYPE_S16   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S32_ARRAY   = BRIG_TYPE_S32   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S64_ARRAY   = BRIG_TYPE_S64   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F16_ARRAY   = BRIG_TYPE_F16   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F32_ARRAY   = BRIG_TYPE_F32   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F64_ARRAY   = BRIG_TYPE_F64   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_B8_ARRAY    = BRIG_TYPE_B8    | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_B16_ARRAY   = BRIG_TYPE_B16   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_B32_ARRAY   = BRIG_TYPE_B32   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_B64_ARRAY   = BRIG_TYPE_B64   | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_B128_ARRAY  = BRIG_TYPE_B128  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_SAMP_ARRAY  = BRIG_TYPE_SAMP  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U8X4_ARRAY  = BRIG_TYPE_U8X4  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U8X8_ARRAY  = BRIG_TYPE_U8X8  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S8X4_ARRAY  = BRIG_TYPE_S8X4  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S8X8_ARRAY  = BRIG_TYPE_S8X8  | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+    BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY,  //.mnemo=""     //.print=""
+
+    // Used internally: all-ones sentinel, tagged skip so the generated tables ignore it (NOTE(review): confirm the skip semantics against the generator).
+    BRIG_TYPE_INVALID = (unsigned) -1 //.skip
+};
+
+enum BrigVariableModifierMask { // Variable-modifier flag masks: DEFINITION = 1 (bit 0), CONST = 2 (bit 1).
+
+    //.nodump
+
+    BRIG_VARIABLE_DEFINITION = 1,
+    BRIG_VARIABLE_CONST = 2
+};
+
+enum BrigWidth { // Width codes: NONE = 0; code k in 1..32 names the width 2^(k-1); WAVESIZE = 33; ALL = 34.
+
+    //.tddef=1
+    //
+    //.print={ s/^BRIG_WIDTH_//; "_width($_)" }
+    // The stored value is the code, not the width itself: e.g. BRIG_WIDTH_4 is 3.
+    BRIG_WIDTH_NONE = 0,
+    BRIG_WIDTH_1 = 1,
+    BRIG_WIDTH_2 = 2,
+    BRIG_WIDTH_4 = 3,
+    BRIG_WIDTH_8 = 4,
+    BRIG_WIDTH_16 = 5,
+    BRIG_WIDTH_32 = 6,
+    BRIG_WIDTH_64 = 7,
+    BRIG_WIDTH_128 = 8,
+    BRIG_WIDTH_256 = 9,
+    BRIG_WIDTH_512 = 10,
+    BRIG_WIDTH_1024 = 11,
+    BRIG_WIDTH_2048 = 12,
+    BRIG_WIDTH_4096 = 13,
+    BRIG_WIDTH_8192 = 14,
+    BRIG_WIDTH_16384 = 15,
+    BRIG_WIDTH_32768 = 16,
+    BRIG_WIDTH_65536 = 17,
+    BRIG_WIDTH_131072 = 18,
+    BRIG_WIDTH_262144 = 19,
+    BRIG_WIDTH_524288 = 20,
+    BRIG_WIDTH_1048576 = 21,
+    BRIG_WIDTH_2097152 = 22,
+    BRIG_WIDTH_4194304 = 23,
+    BRIG_WIDTH_8388608 = 24,
+    BRIG_WIDTH_16777216 = 25,
+    BRIG_WIDTH_33554432 = 26,
+    BRIG_WIDTH_67108864 = 27,
+    BRIG_WIDTH_134217728 = 28,
+    BRIG_WIDTH_268435456 = 29,
+    BRIG_WIDTH_536870912 = 30,
+    BRIG_WIDTH_1073741824 = 31,
+    BRIG_WIDTH_2147483648 = 32,
+    BRIG_WIDTH_WAVESIZE = 33,
+    BRIG_WIDTH_ALL = 34,
+    // LAST has no explicit value, so it is one past ALL (35); it is tagged skip.
+    BRIG_WIDTH_LAST //.skip
+};
+
+struct BrigUInt64 { //.isroot //.standalone
+    uint32_t lo;     //.defValue=0
+    uint32_t hi;     //.defValue=0
+    // A 64-bit value kept as two 32-bit halves; the helper code below assigns from and converts to uint64_t (lo = low 32 bits, hi = high 32 bits).
+    //+hcode KLASS& operator=(uint64_t rhs);
+    //+hcode operator uint64_t();
+    //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; }
+    //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); }
+};
+
+struct BrigAluModifier { //.isroot //.standalone
+    BrigAluModifier8_t allBits; //.defValue=0
+    //^^ bool ftz; //.wtype=BitValRef<0>
+}; // BrigAluModifier: a single flag word (allBits) whose bit 0 is exposed as ftz.
+
+struct BrigBase { //.nowrap
+    uint16_t byteCount; // presumably the size in bytes of the entry this header begins; confirm against the BRIG specification
+    BrigKind16_t kind; // kind code of the entry
+}; // BrigBase: common header; every directive and operand struct in this file starts with one (instructions via BrigInstBase).
+
+//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE };
+//.alias Directive:Code { //.generic };
+//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND };
+
+struct BrigData { // Byte blob: a 32-bit count followed by the bytes.
+    //.nowrap
+    uint32_t byteCount;
+    uint8_t bytes[1]; // declared with one element; presumably the first of byteCount payload bytes (confirm against the BRIG specification)
+};
+
+struct BrigExecutableModifier { //.isroot //.standalone
+    BrigExecutableModifier8_t allBits; //.defValue=0
+    //^^ bool isDefinition; //.wtype=BitValRef<0>
+}; // BrigExecutableModifier: a single flag word (allBits) whose bit 0 is exposed as isDefinition.
+
+struct BrigMemoryModifier { //.isroot //.standalone
+    BrigMemoryModifier8_t allBits; //.defValue=0
+    //^^ bool isConst; //.wtype=BitValRef<0>
+}; // BrigMemoryModifier: a single flag word (allBits) whose bit 0 is exposed as isConst.
+
+struct BrigSegCvtModifier { //.isroot //.standalone
+    BrigSegCvtModifier8_t allBits; //.defValue=0
+    //^^ bool isNoNull; //.wtype=BitValRef<0>
+}; // BrigSegCvtModifier: a single flag word (allBits) whose bit 0 is exposed as isNoNull.
+
+struct BrigVariableModifier { //.isroot //.standalone
+    BrigVariableModifier8_t allBits;    //.defValue=0
+    // Flag word: bit 0 is exposed as isDefinition, bit 1 as isConst.
+    //^^ bool isDefinition;     //.wtype=BitValRef<0>
+    //^^ bool isConst;          //.wtype=BitValRef<1>
+};
+
+struct BrigDirectiveArgBlockEnd { // Header-only directive: no fields beyond BrigBase.
+    BrigBase base;
+};
+
+struct BrigDirectiveArgBlockStart { // Header-only directive: no fields beyond BrigBase.
+    BrigBase base;
+};
+
+struct BrigDirectiveComment { // Comment directive: header plus one string reference.
+    BrigBase base;
+    BrigDataOffsetString32_t name; // string reference; presumably the comment text (confirm against the BRIG specification)
+};
+
+struct BrigDirectiveControl { // Control directive: a control code plus a reference to its operand list.
+    BrigBase base;
+    BrigControlDirective16_t control;
+    uint16_t reserved; //.defValue=0
+    BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveExecutable { //.generic
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    uint16_t outArgCount; //.defValue=0
+    uint16_t inArgCount;  //.defValue=0
+    BrigCodeOffset32_t firstInArg; // code-section offset
+    BrigCodeOffset32_t firstCodeBlockEntry; // code-section offset
+    BrigCodeOffset32_t nextModuleEntry; // code-section offset
+    BrigExecutableModifier modifier; //.acc=subItem<ExecutableModifier> //.wtype=ExecutableModifier
+    BrigLinkage8_t linkage;
+    uint16_t reserved; //.defValue=0
+}; // BrigDirectiveExecutable: shared layout for the DirectiveKernel, DirectiveFunction, DirectiveSignature and DirectiveIndirectFunction aliases.
+
+//.alias DirectiveKernel:DirectiveExecutable { };
+//.alias DirectiveFunction:DirectiveExecutable { };
+//.alias DirectiveSignature:DirectiveExecutable { };
+//.alias DirectiveIndirectFunction:DirectiveExecutable { };
+
+struct BrigDirectiveExtension { // Extension directive: header plus one string reference (name).
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveFbarrier { // Fbarrier directive: name, variable-modifier flags and linkage.
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+    BrigLinkage8_t linkage;
+    uint16_t reserved; //.defValue=0
+};
+
+struct BrigDirectiveLabel { // Label directive: header plus one string reference (name).
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveLoc { // Location directive: a file name reference plus line and column numbers.
+    BrigBase base;
+    BrigDataOffsetString32_t filename;
+    uint32_t line;
+    uint32_t column; //.defValue=1
+};
+
+struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE
+    BrigBase base;
+}; // BrigDirectiveNone: header-only directive, associated with BRIG_KIND_NONE by the annotation on its opening line.
+
+struct BrigDirectivePragma { // Pragma directive: header plus a reference to an operand list.
+    BrigBase base;
+    BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveVariable { // Variable directive: name, initializer reference, type, segment, alignment, 64-bit dim, modifier flags, linkage and allocation.
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigOperandOffset32_t init; // operand-section offset
+    BrigType16_t type;
+    // Helper code: isArray() applies isArrayType() to type(); elementType() returns arrayType2elementType(type()) for arrays and type() otherwise.
+    //+hcode bool isArray();
+    //+implcode inline bool KLASS::isArray() { return isArrayType(type()); }
+
+    //+hcode unsigned elementType();
+    //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); }
+
+    BrigSegment8_t segment;
+    BrigAlignment8_t align;
+    BrigUInt64 dim; //.acc=subItem<UInt64> //.wtype=UInt64
+    BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+    BrigLinkage8_t linkage;
+    BrigAllocation8_t allocation;
+    uint8_t reserved; //.defValue=0
+};
+
+struct BrigDirectiveModule { // Module directive: name, HSAIL major/minor version, profile, machine model and default float rounding.
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVersion32_t hsailMajor;         //.wtype=ValRef<uint32_t>
+    BrigVersion32_t hsailMinor;         //.wtype=ValRef<uint32_t>
+    BrigProfile8_t profile;
+    BrigMachineModel8_t machineModel;
+    BrigRound8_t defaultFloatRound;
+    uint8_t reserved;                   //.defValue=0
+};
+
+struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode
+    BrigBase base;
+    BrigOpcode16_t opcode;
+    BrigType16_t type;
+    BrigDataOffsetOperandList32_t operands;
+    // Common instruction header embedded first in every BrigInst* struct; the helper below returns operands()[index].
+    //+hcode Operand operand(int index);
+    //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; }
+};
+
+struct BrigInstAddr { // Instruction format: common header plus a segment field.
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstAtomic { // Instruction format: segment, memory order, memory scope, atomic operation and equivalence class.
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigMemoryScope8_t memoryScope;
+    BrigAtomicOperation8_t atomicOperation;
+    uint8_t equivClass;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstBasic { // Instruction format with no fields beyond the common BrigInstBase header.
+    BrigInstBase base;
+};
+
+struct BrigInstBr { // Instruction format: common header plus a width code.
+    BrigInstBase base;
+    BrigWidth8_t width;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstCmp { // Instruction format: source type, ALU modifier flags, compare operation and packing code.
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+    BrigCompareOperation8_t compare;
+    BrigPack8_t pack;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstCvt { // Instruction format: source type, ALU modifier flags and rounding code.
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+    BrigRound8_t round;
+};
+
+struct BrigInstImage { // Instruction format: image type, coordinate type, geometry and equivalence class.
+    BrigInstBase base;
+    BrigType16_t imageType;
+    BrigType16_t coordType;
+    BrigImageGeometry8_t geometry;
+    uint8_t equivClass;
+    uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstLane { // Instruction format: source type and width code.
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigWidth8_t width;
+    uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstMem { // Instruction format: segment, alignment, equivalence class, width code and memory modifier flags.
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigAlignment8_t align;
+    uint8_t equivClass;
+    BrigWidth8_t width;
+    BrigMemoryModifier modifier; //.acc=subItem<MemoryModifier> //.wtype=MemoryModifier
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstMemFence { // Instruction format: one memory order plus separate memory scopes for the global, group and image segments.
+    BrigInstBase base;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigMemoryScope8_t globalSegmentMemoryScope;
+    BrigMemoryScope8_t groupSegmentMemoryScope;
+    BrigMemoryScope8_t imageSegmentMemoryScope;
+};
+
+struct BrigInstMod { // Instruction format: ALU modifier flags, rounding code and packing code.
+    BrigInstBase base;
+    BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+    BrigRound8_t round;
+    BrigPack8_t pack;
+    uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstQueryImage { // Instruction format: image type, geometry and image-query selector.
+    BrigInstBase base;
+    BrigType16_t imageType;
+    BrigImageGeometry8_t geometry;
+    BrigImageQuery8_t imageQuery;
+};
+
+struct BrigInstQuerySampler { // Instruction format: common header plus a sampler-query selector.
+    BrigInstBase base;
+    BrigSamplerQuery8_t samplerQuery;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstQueue { // Instruction format: segment and memory order.
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigMemoryOrder8_t memoryOrder;
+    uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstSeg { // Instruction format: common header plus a segment field (same field list as BrigInstAddr).
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstSegCvt { // Instruction format: source type, segment and segment-conversion modifier flags.
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigSegment8_t segment;
+    BrigSegCvtModifier modifier; //.acc=subItem<SegCvtModifier> //.wtype=SegCvtModifier
+};
+
+struct BrigInstSignal { // Instruction format: signal type, memory order and signal operation.
+    BrigInstBase base;
+    BrigType16_t signalType;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigAtomicOperation8_t signalOperation; // uses the same operation type as BrigInstAtomic::atomicOperation
+};
+
+struct BrigInstSourceType { // Instruction format: common header plus a source type.
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    uint16_t reserved; //.defValue=0
+};
+
+struct BrigOperandAddress { // Address operand: a symbol reference, a register reference and a 64-bit offset.
+    BrigBase base;
+    BrigCodeOffset32_t symbol; //.wtype=ItemRef<DirectiveVariable>
+    BrigOperandOffset32_t reg; //.wtype=ItemRef<OperandRegister>
+    BrigUInt64 offset; //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandAlign { // Operand carrying a single alignment code.
+    BrigBase base;
+    BrigAlignment8_t align;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandCodeList { // Operand referencing a list of Code items.
+    BrigBase base;
+    BrigDataOffsetCodeList32_t elements;
+    // Helper code: elementCount() is elements().size(); elements(i) is elements()[i].
+    //+hcode unsigned elementCount();
+    //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+    //+hcode Code elements(int index);
+    //+implcode inline Code KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandCodeRef { // Operand holding one code-section offset.
+    BrigBase base;
+    BrigCodeOffset32_t ref;
+};
+
+struct BrigOperandConstantBytes { // Constant operand: a type code plus a data reference named bytes.
+    BrigBase base;
+    BrigType16_t type; //.defValue=0
+    uint16_t reserved; //.defValue=0
+    BrigDataOffsetString32_t bytes; // presumably the raw constant value stored in the data section; confirm against the BRIG specification
+};
+
+struct BrigOperandConstantOperandList { // Constant operand: a type code plus a reference to a list of Operand items.
+    BrigBase base;
+    BrigType16_t type;
+    uint16_t reserved; //.defValue=0
+    BrigDataOffsetOperandList32_t elements;
+    // Helper code: elementCount() is elements().size(); elements(i) is elements()[i].
+    //+hcode unsigned elementCount();
+    //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+    //+hcode Operand elements(int index);
+    //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandConstantImage { // Constant image operand: type, geometry, channel order/type and four 64-bit extents (width, height, depth, array).
+    BrigBase base;
+    BrigType16_t type;
+    BrigImageGeometry8_t geometry;
+    BrigImageChannelOrder8_t channelOrder;
+    BrigImageChannelType8_t channelType;
+    uint8_t reserved[3]; //.defValue=0
+    BrigUInt64 width;    //.acc=subItem<UInt64> //.wtype=UInt64
+    BrigUInt64 height;   //.acc=subItem<UInt64> //.wtype=UInt64
+    BrigUInt64 depth;    //.acc=subItem<UInt64> //.wtype=UInt64
+    BrigUInt64 array;    //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandOperandList { // Operand referencing a list of Operand items.
+    BrigBase base;
+    BrigDataOffsetOperandList32_t elements;
+    // Helper code: elementCount() is elements().size(); elements(i) is elements()[i].
+    //+hcode unsigned elementCount();
+    //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+    //+hcode Operand elements(int index);
+    //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandRegister { // Register operand: a register kind and a 16-bit register number.
+    BrigBase base;
+    BrigRegisterKind16_t regKind;
+    uint16_t regNum;
+};
+
+struct BrigOperandConstantSampler { // Constant sampler operand: type, coordinate normalization, filter and addressing mode.
+    BrigBase base;
+    BrigType16_t type;
+    BrigSamplerCoordNormalization8_t coord;
+    BrigSamplerFilter8_t filter;
+    BrigSamplerAddressing8_t addressing;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandString { // Operand holding one string reference.
+    BrigBase base;
+    BrigDataOffsetString32_t string;
+};
+
+struct BrigOperandWavesize { // Header-only operand: no fields beyond BrigBase.
+    BrigBase base;
+};
+
+//.ignore{
+
+enum BrigExceptionsMask { // Exception flag masks: one bit each for bits 0..4.
+    BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0,
+    BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1,
+    BRIG_EXCEPTIONS_OVERFLOW = 1 << 2,
+    BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3,
+    BRIG_EXCEPTIONS_INEXACT = 1 << 4,
+    // Bits 5..15 are not named here; FIRST_USER_DEFINED is bit 16.
+    BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16
+};
+
+struct BrigSectionHeader { // Section header: a 64-bit byte count, the header's own byte count, and a length-prefixed name.
+    uint64_t byteCount;
+    uint32_t headerByteCount;
+    uint32_t nameLength;
+    uint8_t name[1]; // declared with one element; presumably the first of nameLength name bytes (confirm against the BRIG specification)
+};
+
+#define MODULE_IDENTIFICATION_LENGTH (8) // element count of BrigModuleHeader::identification
+
+struct BrigModuleHeader { // Module header: identification bytes, BRIG version, total byte count, a 64-byte hash, and section count/index.
+    char identification[MODULE_IDENTIFICATION_LENGTH];
+    BrigVersion32_t brigMajor;
+    BrigVersion32_t brigMinor;
+    uint64_t byteCount;
+    uint8_t hash[64];
+    uint32_t reserved;
+    uint32_t sectionCount;
+    uint64_t sectionIndex; // NOTE(review): presumably an offset to a table of sectionCount entries; confirm against the BRIG specification
+};
+
+typedef BrigModuleHeader* BrigModule_t; // a module is referred to by a pointer to its header
+
+#endif // defined(INCLUDED_BRIG_H)
+//}
diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript
new file mode 100644
index 000000000..3455823a6
--- /dev/null
+++ b/src/arch/hsail/SConscript
@@ -0,0 +1,54 @@
+# -*- mode:python -*-
+
+#  Copyright (c) 2015 Advanced Micro Devices, Inc.
+#  All rights reserved.
+#
+#  For use for simulation and test purposes only
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#  this list of conditions and the following disclaimer.
+#
+#  2. Redistributions in binary form must reproduce the above copyright notice,
+#  this list of conditions and the following disclaimer in the documentation
+#  and/or other materials provided with the distribution.
+#
+#  3. Neither the name of the copyright holder nor the names of its contributors
+#  may be used to endorse or promote products derived from this software
+#  without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+#
+#  Author: Anthony Gutierrez
+#
+
+Import('*')
+# Nothing in this directory is built unless the BUILD_GPU setting is truthy.
+if not env['BUILD_GPU']:
+    Return()
+# The generator step and the source list below apply only when TARGET_GPU_ISA is 'hsail'.
+if env['TARGET_GPU_ISA'] == 'hsail':
+    env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
+                'gen.py', '$SOURCE $TARGETS')
+    # The command above runs gen.py with the three target paths as arguments; two of those targets are registered as sources below.
+    Source('generic_types.cc')
+    Source('gpu_decoder.cc')
+    Source('insts/branch.cc')
+    Source('insts/gen_exec.cc')
+    Source('insts/gpu_static_inst.cc')
+    Source('insts/main.cc')
+    Source('insts/pseudo_inst.cc')
+    Source('insts/mem.cc')
+    Source('operand.cc')
diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts
new file mode 100644
index 000000000..641963c82
--- /dev/null
+++ b/src/arch/hsail/SConsopts
@@ -0,0 +1,40 @@
+# -*- mode:python -*-
+
+#
+#  Copyright (c) 2015 Advanced Micro Devices, Inc.
+#  All rights reserved.
+#
+#  For use for simulation and test purposes only
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#  this list of conditions and the following disclaimer.
+#
+#  2. Redistributions in binary form must reproduce the above copyright notice,
+#  this list of conditions and the following disclaimer in the documentation
+#  and/or other materials provided with the distribution.
+#
+#  3. Neither the name of the copyright holder nor the names of its contributors
+#  may be used to endorse or promote products derived from this software
+#  without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+#
+#  Author: Anthony Gutierrez
+#
+
+Import('*')
+
+all_gpu_isa_list.append('hsail')  # add HSAIL to the known GPU ISAs
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py
new file mode 100755
index 000000000..f2996019b
--- /dev/null
+++ b/src/arch/hsail/gen.py
@@ -0,0 +1,806 @@
+#! /usr/bin/python
+
+#
+#  Copyright (c) 2015 Advanced Micro Devices, Inc.
+#  All rights reserved.
+#
+#  For use for simulation and test purposes only
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#  this list of conditions and the following disclaimer.
+#
+#  2. Redistributions in binary form must reproduce the above copyright notice,
+#  this list of conditions and the following disclaimer in the documentation
+#  and/or other materials provided with the distribution.
+#
+#  3. Neither the name of the copyright holder nor the names of its contributors
+#  may be used to endorse or promote products derived from this software
+#  without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+#
+#  Author: Steve Reinhardt
+#
+
+import sys, re
+
+from m5.util import code_formatter
+
+if len(sys.argv) != 4:
+    print "Error: need 3 args (file names)"
+    sys.exit(0)
+
+header_code = code_formatter()
+decoder_code = code_formatter()
+exec_code = code_formatter()
+
+###############
+#
+# Generate file prologs (includes etc.)
+#
+###############
+
+header_code('''
+#include "arch/hsail/insts/decl.hh"
+#include "base/bitfield.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+''')
+header_code.indent()
+
+decoder_code('''
+#include "arch/hsail/gpu_decoder.hh"
+#include "arch/hsail/insts/branch.hh"
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gen_decl.hh"
+#include "arch/hsail/insts/mem.hh"
+#include "arch/hsail/insts/mem_impl.hh"
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+    std::vector<GPUStaticInst*> Decoder::decodedInsts;
+
+    GPUStaticInst*
+    Decoder::decode(MachInst machInst)
+    {
+        using namespace Brig;
+
+        const BrigInstBase *ib = machInst.brigInstBase;
+        const BrigObject *obj = machInst.brigObj;
+
+        switch(ib->opcode) {
+''')
+decoder_code.indent()
+decoder_code.indent()
+
+exec_code('''
+#include "arch/hsail/insts/gen_decl.hh"
+#include "base/intmath.hh"
+
+namespace HsailISA
+{
+''')
+exec_code.indent()
+
+###############
+#
+# Define code templates for class declarations (for header file)
+#
+###############
+
+# Basic header template for an instruction with no template parameters.
+header_template_nodt = '''
+class $class_name : public $base_class
+{
+  public:
+    typedef $base_class Base;
+
+    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+       : Base(ib, obj, "$opcode")
+    {
+    }
+
+    void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Basic header template for an instruction with a single DataType
+# template parameter.
+header_template_1dt = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+  public:
+    typedef $base_class<DataType> Base;
+    typedef typename DataType::CType CType;
+
+    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+       : Base(ib, obj, "$opcode")
+    {
+    }
+
+    void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_template_1dt_noexec = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+  public:
+    typedef $base_class<DataType> Base;
+    typedef typename DataType::CType CType;
+
+    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+       : Base(ib, obj, "$opcode")
+    {
+    }
+};
+
+'''
+
+# Same as header_template_1dt, except the base class has a second
+# template parameter NumSrcOperands to allow a variable number of
+# source operands.  Note that since this is implemented with an array,
+# it only works for instructions where all sources are of the same
+# type (like most arithmetics).
+header_template_1dt_varsrcs = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType, $num_srcs>
+{
+  public:
+    typedef $base_class<DataType, $num_srcs> Base;
+    typedef typename DataType::CType CType;
+
+    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+       : Base(ib, obj, "$opcode")
+    {
+    }
+
+    void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Header template for instruction with two DataType template
+# parameters, one for the dest and one for the source.  This is used
+# by compare and convert.
+header_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+class $class_name : public $base_class<DestDataType, SrcDataType>
+{
+  public:
+    typedef $base_class<DestDataType, SrcDataType> Base;
+    typedef typename DestDataType::CType DestCType;
+    typedef typename SrcDataType::CType SrcCType;
+
+    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+       : Base(ib, obj, "$opcode")
+    {
+    }
+
+    void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_templates = {
+    'ArithInst': header_template_1dt_varsrcs,
+    'CmovInst': header_template_1dt,
+    'ClassInst': header_template_1dt,
+    'ShiftInst': header_template_1dt,
+    'ExtractInsertInst': header_template_1dt,
+    'CmpInst': header_template_2dt,
+    'CvtInst': header_template_2dt,
+    'LdInst': '',
+    'StInst': '',
+    'SpecialInstNoSrc': header_template_nodt,
+    'SpecialInst1Src': header_template_nodt,
+    'SpecialInstNoSrcNoDest': '',
+}
+
+###############
+#
+# Define code templates for exec functions
+#
+###############
+
+# exec function body
+exec_template_nodt_nosrc = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    typedef Base::DestCType DestCType;
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            DestCType dest_val = $expr;
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_nodt_1src = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    typedef Base::DestCType DestCType;
+    typedef Base::SrcCType  SrcCType;
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
+            DestCType dest_val = $expr;
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_1dt_varsrcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            CType dest_val;
+            if ($dest_is_src_flag) {
+                dest_val = this->dest.template get<CType>(w, lane);
+            }
+
+            CType src_val[$num_srcs];
+
+            for (int i = 0; i < $num_srcs; ++i) {
+                src_val[i] = this->src[i].template get<CType>(w, lane);
+            }
+
+            dest_val = (CType)($expr);
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_1dt_3srcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    typedef typename Base::Src0CType Src0T;
+    typedef typename Base::Src1CType Src1T;
+    typedef typename Base::Src2CType Src2T;
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            CType dest_val;
+
+            if ($dest_is_src_flag) {
+                dest_val = this->dest.template get<CType>(w, lane);
+            }
+
+            Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+            Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+            Src2T src_val2 = this->src2.template get<Src2T>(w, lane);
+
+            dest_val = $expr;
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_1dt_2src_1dest = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    typedef typename Base::DestCType DestT;
+    typedef CType Src0T;
+    typedef typename Base::Src1CType Src1T;
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            DestT dest_val;
+            if ($dest_is_src_flag) {
+                dest_val = this->dest.template get<DestT>(w, lane);
+            }
+            Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+            Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+
+            dest_val = $expr;
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_shift = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    const VectorMask &mask = w->get_pred();
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            CType dest_val;
+
+            if ($dest_is_src_flag) {
+                dest_val = this->dest.template get<CType>(w, lane);
+            }
+
+            CType src_val0 = this->src0.template get<CType>(w, lane);
+            uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);
+
+            dest_val = $expr;
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+void
+$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+    Wavefront *w = gpuDynInst->wavefront();
+
+    const VectorMask &mask = w->get_pred();
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (mask[lane]) {
+            DestCType dest_val;
+            SrcCType src_val[$num_srcs];
+
+            for (int i = 0; i < $num_srcs; ++i) {
+                src_val[i] = this->src[i].template get<SrcCType>(w, lane);
+            }
+
+            dest_val = $expr;
+
+            this->dest.set(w, lane, dest_val);
+        }
+    }
+}
+
+'''
+
+exec_templates = {
+    'ArithInst': exec_template_1dt_varsrcs,
+    'CmovInst': exec_template_1dt_3srcs,
+    'ExtractInsertInst': exec_template_1dt_3srcs,
+    'ClassInst': exec_template_1dt_2src_1dest,
+    'CmpInst': exec_template_2dt,
+    'CvtInst': exec_template_2dt,
+    'LdInst': '',
+    'StInst': '',
+    'SpecialInstNoSrc': exec_template_nodt_nosrc,
+    'SpecialInst1Src': exec_template_nodt_1src,
+    'SpecialInstNoSrcNoDest': '',
+}
+
+###############
+#
+# Define code templates for the decoder cases
+#
+###############
+
+# decode template for nodt-opcode case
+decode_nodt_template = '''
+  case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''
+
+decode_case_prolog_class_inst = '''
+  case BRIG_OPCODE_$brig_opcode_upper:
+    {
+        //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
+        BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
+        //switch (baseOp->kind) {
+        //    case BRIG_OPERAND_REG:
+        //        type = ((const BrigOperandReg*)baseOp)->type;
+        //        break;
+        //    case BRIG_OPERAND_IMMED:
+        //        type = ((const BrigOperandImmed*)baseOp)->type;
+        //        break;
+        //    default:
+        //        fatal("CLASS unrecognized kind of operand %d\\n",
+        //               baseOp->kind);
+        //}
+        switch (type) {'''
+
+# common prolog for 1dt- or 2dt-opcode case: switch on data type
+decode_case_prolog = '''
+  case BRIG_OPCODE_$brig_opcode_upper:
+    {
+        switch (ib->type) {'''
+
+# single-level decode case entry (for 1dt opcodes)
+decode_case_entry = \
+'      case BRIG_TYPE_$type_name: return $constructor(ib, obj);'
+
+decode_store_prolog = \
+'      case BRIG_TYPE_$type_name: {'
+
+decode_store_case_epilog = '''
+    }'''
+
+decode_store_case_entry = \
+'          return $constructor(ib, obj);'
+
+# common epilog for type switch
+decode_case_epilog = '''
+          default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
+              ib->type);
+        }
+    }
+    break;'''
+
+# Additional templates for nested decode on a second type field (for
+# compare and convert).  These are used in place of the
+# decode_case_entry template to create a second-level switch on the
+# second type field inside each case of the first-level type switch.
+# Because the name and location of the second type can vary, the Brig
+# instruction type must be provided in $brig_type, and the name of the
+# second type field must be provided in $type2_field.
+decode_case2_prolog = '''
+        case BRIG_TYPE_$type_name:
+          switch (((Brig$brig_type*)ib)->$type2_field) {'''
+
+decode_case2_entry = \
+'          case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'
+
+decode_case2_epilog = '''
+          default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
+                         ((Brig$brig_type*)ib)->$type2_field);
+        }
+        break;'''
+
+# Figure out how many source operands an expr needs by looking for the
+# highest-numbered srcN value referenced.  Since sources are numbered
+# starting at 0, the return value is N+1.
+def num_src_operands(expr):
+    if expr.find('src2') != -1:
+        return 3
+    elif expr.find('src1') != -1:
+        return 2
+    elif expr.find('src0') != -1:
+        return 1
+    else:
+        return 0
+
+###############
+#
+# Define final code generation methods
+#
+# The gen() and gen_special() functions are the interface for
+# generating actual instructions.
+#
+###############
+
+# Generate class declaration, exec function, and decode switch case
+# for an brig_opcode with a single-level type switch.  The 'types'
+# parameter is a list or tuple of types for which the instruction
+# should be instantiated.
+def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
+        type2_info=None, constructor_prefix='new ', is_store=False):
+    brig_opcode_upper = brig_opcode.upper()
+    class_name = brig_opcode
+    opcode = class_name.lower()
+
+    if base_class == 'ArithInst':
+        # note that expr must be provided with ArithInst so we can
+        # derive num_srcs for the template
+        assert expr
+
+    if expr:
+        # Derive several bits of info from expr.  If expr is not used,
+        # this info will be irrelevant.
+        num_srcs = num_src_operands(expr)
+        # if the RHS expression includes 'dest', then we're doing an RMW
+        # on the reg and we need to treat it like a source
+        dest_is_src = expr.find('dest') != -1
+        dest_is_src_flag = str(dest_is_src).lower() # for C++
+        if base_class in ['ShiftInst']:
+            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+        elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
+            expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
+        else:
+            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+        expr = re.sub(r'\bdest\b', r'dest_val', expr)
+
+    # Strip template arguments off of base class before looking up
+    # appropriate templates
+    base_class_base = re.sub(r'<.*>$', '', base_class)
+    header_code(header_templates[base_class_base])
+
+    if base_class.startswith('SpecialInst'):
+        exec_code(exec_templates[base_class_base])
+    elif base_class.startswith('ShiftInst'):
+        header_code(exec_template_shift)
+    else:
+        header_code(exec_templates[base_class_base])
+
+    if not types or isinstance(types, str):
+        # Just a single type
+        constructor = constructor_prefix + class_name
+        decoder_code(decode_nodt_template)
+    else:
+        # multiple types, need at least one level of decode
+        if brig_opcode == 'Class':
+            decoder_code(decode_case_prolog_class_inst)
+        else:
+            decoder_code(decode_case_prolog)
+        if not type2_info:
+            if is_store == False:
+                # single list of types, to basic one-level decode
+                for type_name in types:
+                    full_class_name = '%s<%s>' % (class_name, type_name.upper())
+                    constructor = constructor_prefix + full_class_name
+                    decoder_code(decode_case_entry)
+            else:
+                # single list of types, to basic one-level decode
+                for type_name in types:
+                    decoder_code(decode_store_prolog)
+                    type_size = int(re.findall(r'[0-9]+', type_name)[0])
+                    src_size = 32
+                    type_type = type_name[0]
+                    full_class_name = '%s<%s,%s>' % (class_name, \
+                                                     type_name.upper(), \
+                                                     '%s%d' % \
+                                                     (type_type.upper(), \
+                                                     type_size))
+                    constructor = constructor_prefix + full_class_name
+                    decoder_code(decode_store_case_entry)
+                    decoder_code(decode_store_case_epilog)
+        else:
+            # need secondary type switch (convert, compare)
+            # unpack extra info on second switch
+            (type2_field, types2) = type2_info
+            brig_type = 'Inst%s' % brig_opcode
+            for type_name in types:
+                decoder_code(decode_case2_prolog)
+                fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
+                for type2_name in types2:
+                    full_class_name = fmt % type2_name.upper()
+                    constructor = constructor_prefix + full_class_name
+                    decoder_code(decode_case2_entry)
+
+                decoder_code(decode_case2_epilog)
+
+        decoder_code(decode_case_epilog)
+
+###############
+#
+# Generate instructions
+#
+###############
+
+# handy abbreviations for common sets of types
+
+# arithmetic ops are typically defined only on 32- and 64-bit sizes
+arith_int_types = ('S32', 'U32', 'S64', 'U64')
+arith_float_types = ('F32', 'F64')
+arith_types = arith_int_types + arith_float_types
+
+bit_types = ('B1', 'B32', 'B64')
+
+all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types
+
+# I think you might be able to do 'f16' memory ops too, but we'll
+# ignore them for now.
+mem_types = all_int_types + arith_float_types
+mem_atom_types = all_int_types + ('B32', 'B64')
+
+##### Arithmetic & logical operations
+gen('Add', arith_types, 'src0 + src1')
+gen('Sub', arith_types, 'src0 - src1')
+gen('Mul', arith_types, 'src0 * src1')
+gen('Div', arith_types, 'src0 / src1')
+gen('Min', arith_types, 'std::min(src0, src1)')
+gen('Max', arith_types, 'std::max(src0, src1)')
+gen('Gcnmin', arith_types, 'std::min(src0, src1)')
+
+gen('CopySign', arith_float_types,
+    'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
+gen('Sqrt', arith_float_types, 'sqrt(src0)')
+gen('Floor', arith_float_types, 'floor(src0)')
+
+# "fast" sqrt... same as slow for us
+gen('Nsqrt', arith_float_types, 'sqrt(src0)')
+gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
+gen('Nrcp', arith_float_types, '1.0/src0')
+gen('Fract', arith_float_types,
+    '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')
+
+gen('Ncos', arith_float_types, 'cos(src0)');
+gen('Nsin', arith_float_types, 'sin(src0)');
+
+gen('And', bit_types, 'src0 & src1')
+gen('Or', bit_types,  'src0 | src1')
+gen('Xor', bit_types, 'src0 ^ src1')
+
+gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
+gen('Firstbit',bit_types, 'firstbit(src0)')
+gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')
+
+gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
+gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')
+
+# gen('Mul_hi', types=('s32','u32', '??'))
+# gen('Mul24', types=('s32','u32', '??'))
+gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')
+
+gen('Abs', arith_types, 'std::abs(src0)')
+gen('Neg', arith_types, '-src0')
+
+gen('Mov', bit_types, 'src0')
+gen('Not', bit_types, 'heynot(src0)')
+
+# mad and fma differ only in rounding behavior, which we don't emulate
+# also there's an integer form of mad, but not of fma
+gen('Mad', arith_types, 'src0 * src1 + src2')
+gen('Fma', arith_float_types, 'src0 * src1 + src2')
+
+#native floating point operations
+gen('Nfma', arith_float_types, 'src0 * src1 + src2')
+
+gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
+gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
+gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')
+
+# see base/bitfield.hh
+gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
+    'ExtractInsertInst')
+
+gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
+    'ExtractInsertInst')
+
+##### Compare
+gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
+    'CmpInst', ('sourceType', arith_types + bit_types))
+gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst')
+
+##### Conversion
+
+# Conversion operations are only defined on B1, not B32 or B64
+cvt_types = ('B1',) + mem_types
+
+gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))
+
+
+##### Load & Store
+gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode',
+    is_store=True)
+gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
+gen('AtomicNoRet', mem_atom_types, base_class='StInst',
+    constructor_prefix='decode')
+
+gen('Cbr', base_class = 'LdInst', constructor_prefix='decode')
+gen('Br', base_class = 'LdInst', constructor_prefix='decode')
+
+##### Special operations
+def gen_special(brig_opcode, expr, dest_type='U32'):
+    num_srcs = num_src_operands(expr)
+    if num_srcs == 0:
+        base_class = 'SpecialInstNoSrc<%s>' % dest_type
+    elif num_srcs == 1:
+        base_class = 'SpecialInst1Src<%s>' % dest_type
+    else:
+        assert false
+
+    gen(brig_opcode, None, expr, base_class)
+
+gen_special('WorkItemId', 'w->workitemid[src0][lane]')
+gen_special('WorkItemAbsId',
+    'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
+gen_special('WorkGroupId', 'w->workgroupid[src0]')
+gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('GridSize', 'w->gridsz[src0]')
+gen_special('GridGroups',
+    'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
+gen_special('LaneId', 'lane')
+gen_special('WaveId', 'w->dynwaveid')
+gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')
+
+# gen_special('CU'', ')
+
+gen('Ret', base_class='SpecialInstNoSrcNoDest')
+gen('Barrier', base_class='SpecialInstNoSrcNoDest')
+gen('MemFence', base_class='SpecialInstNoSrcNoDest')
+
+# Map magic instructions to the BrigSyscall opcode
+# Magic instructions are defined in magic.hh
+#
+# In the future, real HSA kernel system calls can be implemented and coexist
+# with magic instructions.
+gen('Call', base_class='SpecialInstNoSrcNoDest')
+
+###############
+#
+# Generate file epilogs
+#
+###############
+header_code.dedent()
+header_code('''
+} // namespace HsailISA
+''')
+
+# close off main decode switch
+decoder_code.dedent()
+decoder_code.dedent()
+decoder_code('''
+          default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
+        } // end switch(ib->opcode)
+    } // end decode()
+} // namespace HsailISA
+''')
+
+exec_code.dedent()
+exec_code('''
+} // namespace HsailISA
+''')
+
+###############
+#
+# Output accumulated code to files
+#
+###############
+header_code.write(sys.argv[1])
+decoder_code.write(sys.argv[2])
+exec_code.write(sys.argv[3])
diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc
new file mode 100644
index 000000000..0cd55d1d5
--- /dev/null
+++ b/src/arch/hsail/generic_types.cc
@@ -0,0 +1,47 @@
+#include "arch/hsail/generic_types.hh"
+#include "base/misc.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+    Enums::GenericMemoryOrder
+    getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
+    {
+        switch(brig_memory_order) {  // one generic value per Brig order
+          case BRIG_MEMORY_ORDER_NONE:
+            return Enums::MEMORY_ORDER_NONE;
+          case BRIG_MEMORY_ORDER_RELAXED:
+            return Enums::MEMORY_ORDER_RELAXED;
+          case BRIG_MEMORY_ORDER_SC_ACQUIRE:
+            return Enums::MEMORY_ORDER_SC_ACQUIRE;
+          case BRIG_MEMORY_ORDER_SC_RELEASE:
+            return Enums::MEMORY_ORDER_SC_RELEASE;
+          case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+            return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
+          default:  // adjacent literals below form a single message
+            fatal("HsailISA::MemInst::getGenericMemoryOrder -> "
+                  "bad BrigMemoryOrder\n");
+        }
+    }
+
+    Enums::GenericMemoryScope
+    getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
+    {
+        switch(brig_memory_scope) {  // one generic value per Brig scope
+          case BRIG_MEMORY_SCOPE_NONE:
+            return Enums::MEMORY_SCOPE_NONE;
+          case BRIG_MEMORY_SCOPE_WORKITEM:
+            return Enums::MEMORY_SCOPE_WORKITEM;
+          case BRIG_MEMORY_SCOPE_WORKGROUP:
+            return Enums::MEMORY_SCOPE_WORKGROUP;
+          case BRIG_MEMORY_SCOPE_AGENT:  // Brig "agent" is generic DEVICE
+            return Enums::MEMORY_SCOPE_DEVICE;
+          case BRIG_MEMORY_SCOPE_SYSTEM:
+            return Enums::MEMORY_SCOPE_SYSTEM;
+          default:  // adjacent literals below form a single message
+            fatal("HsailISA::MemInst::getGenericMemoryScope -> "
+                  "bad BrigMemoryScope\n");
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh
new file mode 100644
index 000000000..50e430bef
--- /dev/null
+++ b/src/arch/hsail/generic_types.hh
@@ -0,0 +1,16 @@
+#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
+#define __ARCH_HSAIL_GENERIC_TYPES_HH__
+
+#include "arch/hsail/Brig.h"
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+
+namespace HsailISA  // translate Brig memory order/scope to generic enums
+{
+    Enums::GenericMemoryOrder
+    getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
+    Enums::GenericMemoryScope
+    getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__
diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh
new file mode 100644
index 000000000..98a689664
--- /dev/null
+++ b/src/arch/hsail/gpu_decoder.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
+#define __ARCH_HSAIL_GPU_DECODER_HH__
+
+#include <vector>
+
+#include "arch/hsail/gpu_types.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
+namespace Brig
+{
+    class BrigInstBase;
+}
+
+namespace HsailISA
+{
+    class Decoder
+    {
+      public:
+        GPUStaticInst* decode(MachInst machInst);  // body generated by gen.py
+        // Return the inst stored at index 'inst', or nullptr if out of range.
+        GPUStaticInst*
+        decode(RawMachInst inst)
+        {
+            return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
+        }
+        // Append decodedInst to decodedInsts and return its index.
+        RawMachInst
+        saveInst(GPUStaticInst *decodedInst)
+        {
+            decodedInsts.push_back(decodedInst);
+
+            return decodedInsts.size() - 1;
+        }
+
+      private:
+        static std::vector<GPUStaticInst*> decodedInsts;  // class-wide list
+    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GPU_DECODER_HH__
diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh
new file mode 100644
index 000000000..4b3a66a9a
--- /dev/null
+++ b/src/arch/hsail/gpu_types.hh
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
+#define __ARCH_HSAIL_GPU_TYPES_HH__
+
+#include <cstdint>
+
+namespace Brig
+{
+    class BrigInstBase;
+}
+
+class BrigObject;
+
+namespace HsailISA
+{
+    // A raw machine instruction represents the raw bits that
+    // our model uses to represent an actual instruction. In
+    // the case of HSAIL this is just an index into a list of
+    // instruction objects.
+    typedef uint64_t RawMachInst;
+
+    // The MachInst is a representation of an instruction
+    // that has more information than just the machine code.
+    // For HSAIL the actual machine code is a BrigInstBase
+    // and the BrigObject contains more pertinent
+    // information related to operands, etc.
+
+    struct MachInst
+    {
+        const Brig::BrigInstBase *brigInstBase; // the machine code proper
+        const BrigObject *brigObj;              // operand info, etc.
+    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GPU_TYPES_HH__
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc
new file mode 100644
index 000000000..d65279cc8
--- /dev/null
+++ b/src/arch/hsail/insts/branch.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/branch.hh"
+
+#include "gpu-compute/hsail_code.hh"
+
+namespace HsailISA
+{
+    GPUStaticInst*
+    decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // The target is operand 0: a register there means an indirect
+        // branch, anything else is decoded as a direct branch.
+        unsigned target_offs = obj->getOperandPtr(ib->operands, 0);
+        const Brig::BrigOperand *target_op = obj->getOperand(target_offs);
+
+        if (target_op->kind != Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new BrnDirectInst(ib, obj);
+        }
+
+        return new BrnIndirectInst(ib, obj);
+    }
+
+    GPUStaticInst*
+    decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // Operand 0 is the condition, so the branch target is operand 1;
+        // a register target selects the indirect form of the branch.
+        unsigned target_offs = obj->getOperandPtr(ib->operands, 1);
+        const Brig::BrigOperand *target_op = obj->getOperand(target_offs);
+
+        if (target_op->kind != Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new CbrDirectInst(ib, obj);
+        }
+
+        return new CbrIndirectInst(ib, obj);
+    }
+
+    GPUStaticInst*
+    decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // br carries no condition operand, so its target is operand 0
+        // (the operand BrInstBase reads); a register means indirect.
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+        const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new BrIndirectInst(ib, obj);
+        } else {
+            return new BrDirectInst(ib, obj);
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
new file mode 100644
index 000000000..54ad9a042
--- /dev/null
+++ b/src/arch/hsail/insts/branch.hh
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
+#define __ARCH_HSAIL_INSTS_BRANCH_HH__
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+
+    // The main difference between a direct branch and an indirect branch
+    // is whether the target is a register or a label, so we can share a
+    // lot of code if we template the base implementation on that type.
+    template<typename TargetType>
+    class BrnInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        Brig::BrigWidth8_t width;   // width field copied from the BRIG inst
+        TargetType target;          // branch target, the only operand
+
+        // Build a "brn" instruction from its BRIG encoding: ib supplies
+        // the width and the operand list, obj resolves operand offsets.
+        BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "brn")
+        {
+            o_type = Enums::OT_BRANCH;
+            width = ((Brig::BrigInstBr*)ib)->width;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            target.init(op_offs, obj);
+        }
+
+        uint32_t getTargetPc()  override { return target.getTarget(0, 0); }
+
+        bool unconditionalJumpInstruction() override { return true; }
+
+        // Operand queries: the target is the sole operand, so any valid
+        // operandIndex (only 0) is answered by the target itself.
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isScalarRegister();
+        }
+
+        bool isSrcOperand(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return true;
+        }
+
+        bool isDstOperand(int operandIndex) { return false; }
+
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.opSize();
+        }
+
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.regIndex();
+        }
+
+        int getNumOperands() { return 1; }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    template<typename TargetType>
+    void
+    BrnInstBase<TargetType>::generateDisassembly()
+    {
+        // Builds the text "<opcode>[_width(<w>)] <target>". The width
+        // clause is left out when the stored width value equals 1, in
+        // which case an empty string takes its place.
+        const std::string width_text =
+            (width != 1) ? csprintf("_width(%d)", width) : std::string();
+
+        disassembly = csprintf("%s%s %s", opcode, width_text,
+                               target.disassemble());
+    }
+
+    template<typename TargetType>
+    void
+    BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        if (getTargetPc() != wf->rpc()) {
+            // Rpc and execution mask remain the same; only the pc moves
+            wf->pc(getTargetPc());
+        } else {
+            // Target is the reconvergence pc: pop the stack instead
+            wf->popFromReconvergenceStack();
+        }
+        wf->discardFetch();
+    }
+
+    // "brn" whose target is a label operand (direct branch).
+    class BrnDirectInst : public BrnInstBase<LabelOperand>
+    {
+      public:
+        BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrnInstBase<LabelOperand>(ib, obj) {}
+        // A label target involves no register operands at all.
+        int numDstRegOperands() { return 0; }
+        int numSrcRegOperands() { return 0; }
+    };
+
+    // "brn" whose target is held in an SRegOperand (indirect branch).
+    class BrnIndirectInst : public BrnInstBase<SRegOperand>
+    {
+      public:
+        BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrnInstBase<SRegOperand>(ib, obj) {}
+        int numDstRegOperands() { return 0; }
+        // The target counts as a source register iff it is a vector reg.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+    };
+
+    GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
+                             const BrigObject *obj);
+
+    // Common code of the direct and indirect forms of the "cbr" branch.
+    template<typename TargetType>
+    class CbrInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        Brig::BrigWidth8_t width;   // width field copied from the BRIG inst
+        CRegOperand cond;           // branch condition (BRIG operand 0)
+        TargetType target;          // branch target (BRIG operand 1)
+
+        // Build a "cbr" instruction from its BRIG encoding: ib supplies
+        // the width and the operand list, obj resolves operand offsets.
+        CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "cbr")
+        {
+            o_type = Enums::OT_BRANCH;
+            width = ((Brig::BrigInstBr *)ib)->width;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            cond.init(op_offs, obj);
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            target.init(op_offs, obj);
+        }
+
+        uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+        // Assumption: Target is operand 0, Condition Register is operand 1
+        // (the reverse of their order in the BRIG operand list).
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.isVectorRegister();
+            else
+                return false;
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.isCondRegister();
+            else
+                return true;
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (!operandIndex)
+                return target.isScalarRegister();
+            else
+                return false;
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            // NOTE(review): the condition (index 1) is not reported as a
+            // source although execute() reads it; confirm this is intended.
+            if (operandIndex == 0)
+                return true;
+            return false;
+        }
+        // both Condition Register and Target are source operands
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.opSize();
+            else
+                return 1;
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.regIndex();
+            else
+                return -1;
+        }
+        // Operands = Target, Condition Register
+        int getNumOperands() { return 2; }
+    };
+
+    template<typename TargetType>
+    void
+    CbrInstBase<TargetType>::generateDisassembly()
+    {
+        // Builds the text "<opcode>[_width(<w>)] <cond>,<target>". The
+        // width clause is left out when the stored width value equals 1,
+        // in which case an empty string takes its place.
+        const std::string width_text =
+            (width != 1) ? csprintf("_width(%d)", width) : std::string();
+
+        disassembly = csprintf("%s%s %s,%s", opcode, width_text,
+                               cond.disassemble(), target.disassemble());
+    }
+
+    template<typename TargetType>
+    void
+    CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        const uint32_t curr_pc = w->pc();
+        const uint32_t curr_rpc = w->rpc();
+        const VectorMask curr_mask = w->execMask();
+
+        /**
+         * Pop the current entry; new entries are pushed below as needed.
+         * TODO: can this pop move from the instruction to the wavefront?
+         */
+        w->popFromReconvergenceStack();
+
+        // immediate post-dominator instruction (paths equal to it are skipped)
+        const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
+        if (curr_rpc != rpc) {
+            w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
+        }
+
+        // taken branch: active lanes whose condition register reads true
+        const uint32_t true_pc = getTargetPc();
+        VectorMask true_mask;
+        for (unsigned int lane = 0; lane < VSZ; ++lane) {
+            true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
+        }
+
+        // not taken branch: the other active lanes continue at pc + 1
+        const uint32_t false_pc = curr_pc + 1;
+        assert(true_pc != false_pc);
+        if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
+            VectorMask false_mask = curr_mask & ~true_mask;
+            w->pushToReconvergenceStack(false_pc, rpc, false_mask);
+        }
+
+        if (true_pc != rpc && true_mask.count()) {  // some lane takes it
+            w->pushToReconvergenceStack(true_pc, rpc, true_mask);
+        }
+        assert(w->pc() != curr_pc);  // pc presumably follows the stack top
+        w->discardFetch();
+    }
+
+
+    // "cbr" whose target is a label operand (direct branch).
+    class CbrDirectInst : public CbrInstBase<LabelOperand>
+    {
+      public:
+        CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : CbrInstBase<LabelOperand>(ib, obj) {}
+
+        int numDstRegOperands() { return 0; }
+
+        // Formally the Condition Register is a source operand, but it
+        // is not stored in the VRF, so it is deliberately left out of
+        // the source-register count.
+        int numSrcRegOperands() { return 0; }
+    };
+
+    // "cbr" whose target is held in an SRegOperand (indirect branch).
+    class CbrIndirectInst : public CbrInstBase<SRegOperand>
+    {
+      public:
+        CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : CbrInstBase<SRegOperand>(ib, obj) {}
+
+        int numDstRegOperands() { return 0; }
+        // The Condition Register is formally a source operand too, but
+        // it is not stored in the VRF and so is not counted; only a
+        // vector-register target contributes to the count.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+    };
+
+    GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
+                             const BrigObject *obj);
+
+    // Common code of the direct and indirect forms of the "br" branch.
+    template<typename TargetType>
+    class BrInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        ImmOperand<uint32_t> width; // initialised from the BRIG width field
+        TargetType target;          // branch target, the only operand
+
+        BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "br")
+        {
+            o_type = Enums::OT_BRANCH;
+            // NOTE(review): the raw width field is passed to init() in
+            // the argument slot target.init() uses for an offset; confirm.
+            width.init(((Brig::BrigInstBr *)ib)->width, obj);
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            target.init(op_offs, obj);
+        }
+
+        uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+        bool unconditionalJumpInstruction() override { return true; }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.opSize();
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.regIndex();
+        }
+        int getNumOperands() { return 1; }
+    };
+
+    template<typename TargetType>
+    void
+    BrInstBase<TargetType>::generateDisassembly()
+    {
+        // Builds the text "<opcode>[_width(<w>)] <target>". The width
+        // clause is left out when width.bits equals 1, in which case
+        // an empty string takes its place.
+        const std::string width_text = (width.bits != 1) ?
+            csprintf("_width(%d)", width.bits) : std::string();
+
+        disassembly = csprintf("%s%s %s", opcode, width_text,
+                               target.disassemble());
+    }
+
+    template<typename TargetType>
+    void
+    BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        if (getTargetPc() != wf->rpc()) {
+            // Rpc and execution mask remain the same; only the pc moves
+            wf->pc(getTargetPc());
+        } else {
+            // Target is the reconvergence pc: pop the stack instead
+            wf->popFromReconvergenceStack();
+        }
+        wf->discardFetch();
+    }
+
+    // "br" whose target is a label operand (direct branch).
+    class BrDirectInst : public BrInstBase<LabelOperand>
+    {
+      public:
+        BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrInstBase<LabelOperand>(ib, obj) {}
+
+        // A label target involves no register operands at all.
+        int numDstRegOperands() { return 0; }
+        int numSrcRegOperands() { return 0; }
+    };
+
+    // "br" whose target is held in an SRegOperand (indirect branch).
+    class BrIndirectInst : public BrInstBase<SRegOperand>
+    {
+      public:
+        BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrInstBase<SRegOperand>(ib, obj) {}
+        int numDstRegOperands() { return 0; }
+        // The target counts as a source register iff it is a vector reg.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+    };
+
+    GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
+                            const BrigObject *obj);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
new file mode 100644
index 000000000..e2da501b9
--- /dev/null
+++ b/src/arch/hsail/insts/decl.hh
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_DECL_HH__
+#define __ARCH_HSAIL_INSTS_DECL_HH__
+
+#include <cmath>
+
+#include "arch/hsail/generic_types.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "debug/HSAIL.hh"
+#include "enums/OpType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+namespace HsailISA
+{
+    // Bundles the operand classes used for destination and for sources.
+    template<typename _DestOperand, typename _SrcOperand>
+    class HsailOperandType
+    {
+      public:
+        using DestOperand = _DestOperand;
+        using SrcOperand = _SrcOperand;
+    };
+    using CRegOperandType = HsailOperandType<CRegOperand, CRegOrImmOperand>;
+    using SRegOperandType = HsailOperandType<SRegOperand, SRegOrImmOperand>;
+    using DRegOperandType = HsailOperandType<DRegOperand, DRegOrImmOperand>;
+
+    // Describes one HSAIL data type: its operand classes, C++ value type,
+    // memory type and VGPR type. IsBits only disambiguates the B* types
+    // from the otherwise identical (and indistinguishable) U* types.
+    template<typename _OperandType, typename _CType, Enums::MemType _memType,
+             vgpr_type _vgprType, int IsBits=0>
+    class HsailDataType
+    {
+      public:
+        using OperandType = _OperandType;
+        using CType = _CType;
+        static const Enums::MemType memType = _memType;
+        static const vgpr_type vgprType = _vgprType;
+        static const char *label;
+    };
+
+    // Bit types (IsBits = 1); B1 uses the condition-register operands.
+    using B1 = HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1>;
+    using B8 = HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1>;
+    using B16 = HsailDataType<SRegOperandType, uint16_t,
+                              Enums::M_U16, VT_32, 1>;
+    using B32 = HsailDataType<SRegOperandType, uint32_t,
+                              Enums::M_U32, VT_32, 1>;
+    using B64 = HsailDataType<DRegOperandType, uint64_t,
+                              Enums::M_U64, VT_64, 1>;
+
+    // Signed and unsigned integer types.
+    using S8 = HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32>;
+    using S16 = HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32>;
+    using S32 = HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32>;
+    using S64 = HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64>;
+
+    using U8 = HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32>;
+    using U16 = HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32>;
+    using U32 = HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32>;
+    using U64 = HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64>;
+
+    // Floating-point types.
+    using F32 = HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32>;
+    using F64 = HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64>;
+
+    // Base for instructions with one destination operand followed by
+    // NumSrcOperands source operands in the BRIG operand list. The
+    // operand queries number the sources first (0..NumSrcOperands-1)
+    // and the destination last (index NumSrcOperands).
+    template<typename DestOperandType, typename SrcOperandType,
+             int NumSrcOperands>
+    class CommonInstBase : public HsailGPUStaticInst
+    {
+      protected:
+        typename DestOperandType::DestOperand dest;
+        typename SrcOperandType::SrcOperand src[NumSrcOperands];
+
+        // Text form: "<opcode><suffix> <dest>,<src0>,<src1>,...".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s%s %s", opcode, opcode_suffix(),
+                                   dest.disassemble());
+
+            for (int idx = 0; idx != NumSrcOperands; ++idx) {
+                disassembly += ",";
+                disassembly += src[idx].disassemble();
+            }
+        }
+
+        // Text appended directly to the opcode when disassembling.
+        virtual std::string opcode_suffix() = 0;
+
+      public:
+        // BRIG operand 0 initialises dest; BRIG operands 1 and up
+        // initialise src[0], src[1], ... in order.
+        CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *opcode)
+            : HsailGPUStaticInst(obj, opcode)
+        {
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+
+            for (int idx = 0; idx != NumSrcOperands; ++idx) {
+                unsigned src_offs = obj->getOperandPtr(ib->operands, idx + 1);
+                src[idx].init(src_offs, obj);
+            }
+        }
+
+        // Operand queries: indices below NumSrcOperands refer to src[];
+        // the one remaining valid index refers to dest.
+        bool isVectorRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex >= NumSrcOperands)
+                return dest.isVectorRegister();
+            return src[operandIndex].isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex >= NumSrcOperands)
+                return dest.isCondRegister();
+            return src[operandIndex].isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex >= NumSrcOperands)
+                return dest.isScalarRegister();
+            return src[operandIndex].isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex < NumSrcOperands;
+        }
+
+        bool isDstOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex >= NumSrcOperands;
+        }
+        int getOperandSize(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex >= NumSrcOperands)
+                return dest.opSize();
+            return src[operandIndex].opSize();
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex >= NumSrcOperands)
+                return dest.regIndex();
+            return src[operandIndex].regIndex();
+        }
+        // Number of source operands that are vector registers.
+        int numSrcRegOperands() {
+            int vec_srcs = 0;
+            for (int idx = 0; idx != NumSrcOperands; ++idx) {
+                if (src[idx].isVectorRegister() == true)
+                    ++vec_srcs;
+            }
+            return vec_srcs;
+        }
+        // Counts dest when it is a vector register.
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands() { return NumSrcOperands + 1; }
+    };
+
+    // Arithmetic instruction whose destination and sources all use the
+    // operand type of a single DataType.
+    template<typename DataType, int NumSrcOperands>
+    class ArithInst : public CommonInstBase<typename DataType::OperandType,
+                                            typename DataType::OperandType,
+                                            NumSrcOperands>
+    {
+      public:
+        ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                  const char *opcode)
+            : CommonInstBase<typename DataType::OperandType,
+                             typename DataType::OperandType,
+                             NumSrcOperands>(ib, obj, opcode) {}
+        // The suffix is "_" followed by the data type's label.
+        std::string opcode_suffix() { return csprintf("_%s", DataType::label); }
+    };
+
+    /**
+     * Base for instructions with one destination and three sources whose
+     * operand types may all differ.
+     *
+     * Two numbering schemes are in play: the BRIG operand list read by the
+     * constructor holds the destination in slot 0 and the sources in slots
+     * 1-3, while the operandIndex taken by the query methods numbers the
+     * sources 0-2 and the destination 3.
+     */
+    template<typename DestOperandType, typename Src0OperandType,
+             typename Src1OperandType, typename Src2OperandType>
+    class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst
+    {
+      protected:
+        typename DestOperandType::DestOperand dest;
+        typename Src0OperandType::SrcOperand  src0;
+        typename Src1OperandType::SrcOperand  src1;
+        typename Src2OperandType::SrcOperand  src2;
+
+        // Render "<opcode> <dest>,<src0>,<src1>,<src2>".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(),
+                                   src0.disassemble(), src1.disassemble(),
+                                   src2.disassemble());
+        }
+
+      public:
+        ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+                                      const BrigObject *obj,
+                                      const char *opcode)
+            : HsailGPUStaticInst(obj, opcode)
+        {
+            // BRIG slot 0 is the destination, slots 1-3 are the sources.
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+
+            unsigned src0_offs = obj->getOperandPtr(ib->operands, 1);
+            src0.init(src0_offs, obj);
+
+            unsigned src1_offs = obj->getOperandPtr(ib->operands, 2);
+            src1.init(src1_offs, obj);
+
+            unsigned src2_offs = obj->getOperandPtr(ib->operands, 3);
+            src2.init(src2_offs, obj);
+        }
+
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isVectorRegister();
+              case 1:
+                return src1.isVectorRegister();
+              case 2:
+                return src2.isVectorRegister();
+              default:
+                return dest.isVectorRegister();
+            }
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isCondRegister();
+              case 1:
+                return src1.isCondRegister();
+              case 2:
+                return src2.isCondRegister();
+              default:
+                return dest.isCondRegister();
+            }
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isScalarRegister();
+              case 1:
+                return src1.isScalarRegister();
+              case 2:
+                return src2.isScalarRegister();
+              default:
+                return dest.isScalarRegister();
+            }
+        }
+
+        // Indices 0-2 are the sources.
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex < 3;
+        }
+
+        // Index 3 is the destination.
+        bool
+        isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex >= 3;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.opSize();
+              case 1:
+                return src1.opSize();
+              case 2:
+                return src2.opSize();
+              default:
+                return dest.opSize();
+            }
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.regIndex();
+              case 1:
+                return src1.regIndex();
+              case 2:
+                return src2.regIndex();
+              default:
+                return dest.regIndex();
+            }
+        }
+
+        // Number of sources that are vector registers.
+        int
+        numSrcRegOperands()
+        {
+            int vreg_srcs = 0;
+            vreg_srcs += (src0.isVectorRegister() == true) ? 1 : 0;
+            vreg_srcs += (src1.isVectorRegister() == true) ? 1 : 0;
+            vreg_srcs += (src2.isVectorRegister() == true) ? 1 : 0;
+            return vreg_srcs;
+        }
+
+        // 1 when the destination is a vector register, otherwise 0.
+        int
+        numDstRegOperands()
+        {
+            return dest.isVectorRegister();
+        }
+
+        int
+        getNumOperands()
+        {
+            return 4;
+        }
+    };
+
+    /**
+     * Data-type-level wrapper around ThreeNonUniformSourceInstBase: maps
+     * each data type to its OperandType for the base class and publishes
+     * the matching C types for use by derived instructions.
+     */
+    template<typename DestDataType, typename Src0DataType,
+             typename Src1DataType, typename Src2DataType>
+    class ThreeNonUniformSourceInst :
+        public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+                                             typename Src0DataType::OperandType,
+                                             typename Src1DataType::OperandType,
+                                             typename Src2DataType::OperandType>
+    {
+      public:
+        // C types backing the destination and each source
+        using DestCType = typename DestDataType::CType;
+        using Src0CType = typename Src0DataType::CType;
+        using Src1CType = typename Src1DataType::CType;
+        using Src2CType = typename Src2DataType::CType;
+
+        ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib,
+                                  const BrigObject *obj, const char *_opcode)
+            : ThreeNonUniformSourceInstBase<
+                  typename DestDataType::OperandType,
+                  typename Src0DataType::OperandType,
+                  typename Src1DataType::OperandType,
+                  typename Src2DataType::OperandType>(ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Three-source instruction whose first source is a B1 value and whose
+     * destination and remaining two sources share DataType.
+     */
+    template<typename DataType>
+    class CmovInst :
+        public ThreeNonUniformSourceInst<DataType, B1, DataType, DataType>
+    {
+      public:
+        CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                 const char *_opcode)
+            : ThreeNonUniformSourceInst<DataType, B1, DataType, DataType>(
+                  ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Three-source instruction whose destination and first source share
+     * DataType while the second and third sources are U32 values.
+     */
+    template<typename DataType>
+    class ExtractInsertInst :
+        public ThreeNonUniformSourceInst<DataType, DataType, U32, U32>
+    {
+      public:
+        ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                          const char *_opcode)
+            : ThreeNonUniformSourceInst<DataType, DataType, U32, U32>(
+                  ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Base for instructions with one destination and two sources whose
+     * operand types may all differ.
+     *
+     * The BRIG operand list read by the constructor holds the destination
+     * in slot 0 and the sources in slots 1-2; the operandIndex taken by the
+     * query methods numbers the sources 0-1 and the destination 2.
+     */
+    template<typename DestOperandType, typename Src0OperandType,
+             typename Src1OperandType>
+    class TwoNonUniformSourceInstBase : public HsailGPUStaticInst
+    {
+      protected:
+        typename DestOperandType::DestOperand dest;
+        typename Src0OperandType::SrcOperand src0;
+        typename Src1OperandType::SrcOperand src1;
+
+        // Render "<opcode> <dest>,<src0>,<src1>".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(),
+                                   src0.disassemble(), src1.disassemble());
+        }
+
+      public:
+        TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+                                    const BrigObject *obj, const char *opcode)
+            : HsailGPUStaticInst(obj, opcode)
+        {
+            // BRIG slot 0 is the destination, slots 1-2 are the sources.
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+
+            unsigned src0_offs = obj->getOperandPtr(ib->operands, 1);
+            src0.init(src0_offs, obj);
+
+            unsigned src1_offs = obj->getOperandPtr(ib->operands, 2);
+            src1.init(src1_offs, obj);
+        }
+
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isVectorRegister();
+              case 1:
+                return src1.isVectorRegister();
+              default:
+                return dest.isVectorRegister();
+            }
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isCondRegister();
+              case 1:
+                return src1.isCondRegister();
+              default:
+                return dest.isCondRegister();
+            }
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.isScalarRegister();
+              case 1:
+                return src1.isScalarRegister();
+              default:
+                return dest.isScalarRegister();
+            }
+        }
+
+        // Indices 0-1 are the sources.
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex < 2;
+        }
+
+        // Index 2 is the destination.
+        bool
+        isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex >= 2;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.opSize();
+              case 1:
+                return src1.opSize();
+              default:
+                return dest.opSize();
+            }
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            switch (operandIndex) {
+              case 0:
+                return src0.regIndex();
+              case 1:
+                return src1.regIndex();
+              default:
+                return dest.regIndex();
+            }
+        }
+
+        // Number of sources that are vector registers.
+        int
+        numSrcRegOperands()
+        {
+            int vreg_srcs = 0;
+            vreg_srcs += (src0.isVectorRegister() == true) ? 1 : 0;
+            vreg_srcs += (src1.isVectorRegister() == true) ? 1 : 0;
+            return vreg_srcs;
+        }
+
+        // 1 when the destination is a vector register, otherwise 0.
+        int
+        numDstRegOperands()
+        {
+            return dest.isVectorRegister();
+        }
+
+        int
+        getNumOperands()
+        {
+            return 3;
+        }
+    };
+
+    /**
+     * Data-type-level wrapper around TwoNonUniformSourceInstBase: maps each
+     * data type to its OperandType for the base class and publishes the
+     * matching C types for use by derived instructions.
+     */
+    template<typename DestDataType, typename Src0DataType,
+             typename Src1DataType>
+    class TwoNonUniformSourceInst :
+        public TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+                                           typename Src0DataType::OperandType,
+                                           typename Src1DataType::OperandType>
+    {
+      public:
+        // C types backing the destination and each source
+        using DestCType = typename DestDataType::CType;
+        using Src0CType = typename Src0DataType::CType;
+        using Src1CType = typename Src1DataType::CType;
+
+        TwoNonUniformSourceInst(const Brig::BrigInstBase *ib,
+                                const BrigObject *obj, const char *_opcode)
+            : TwoNonUniformSourceInstBase<
+                  typename DestDataType::OperandType,
+                  typename Src0DataType::OperandType,
+                  typename Src1DataType::OperandType>(ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Helper function for ClassInst: test a floating-point value against a
+     * mask of floating-point classes.
+     *
+     * Mask bits tested by this function:
+     *   0x001, 0x002  NaN (either bit selects any NaN)
+     *   0x004  negative infinity     0x200  positive infinity
+     *   0x008  negative normal       0x100  positive normal
+     *   0x010  negative subnormal    0x080  positive subnormal
+     *   0x020  negative zero         0x040  positive zero
+     *
+     * @param src0 value to classify
+     * @param src1 mask of classes to test for
+     * @return true if src0 belongs to at least one class selected in src1
+     */
+    template<typename T>
+    bool
+    fpclassify(T src0, uint32_t src1)
+    {
+        const int fpclass = std::fpclassify(src0);
+
+        // NaNs are matched regardless of their sign bit.
+        if (fpclass == FP_NAN)
+            return (src1 & 0x3) != 0;
+
+        // The sign has to come from the sign bit: -0.0 and +0.0 compare
+        // equal, so a relational test against -0.0 cannot tell them apart
+        // and would classify +0.0 as negative zero.
+        const bool negative = std::signbit(src0);
+
+        uint32_t class_bit = 0;
+        switch (fpclass) {
+          case FP_INFINITE:
+            class_bit = negative ? 0x004 : 0x200;
+            break;
+          case FP_NORMAL:
+            class_bit = negative ? 0x008 : 0x100;
+            break;
+          case FP_SUBNORMAL:
+            class_bit = negative ? 0x010 : 0x080;
+            break;
+          case FP_ZERO:
+            class_bit = negative ? 0x020 : 0x040;
+            break;
+          default:
+            break;
+        }
+
+        return (src1 & class_bit) != 0;
+    }
+
+    /**
+     * Two-source instruction producing a B1 result from a DataType source
+     * and a U32 source.
+     */
+    template<typename DataType>
+    class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32>
+    {
+      public:
+        ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                  const char *_opcode)
+            : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Two-source instruction whose destination and first source share
+     * DataType while the second source is a U32 value.
+     */
+    template<typename DataType>
+    class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+    {
+      public:
+        ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                  const char *_opcode)
+            : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj,
+                                                               _opcode)
+        {
+        }
+    };
+
+    /**
+     * Helper function for CmpInst: evaluate a BRIG compare operation.
+     *
+     * For floating-point operands the ordered forms (EQ, NE, LT, LE, GT,
+     * GE) are false whenever either operand is NaN, and the unordered
+     * forms (EQU, NEU, LTU, LEU, GTU, GEU) are true whenever either operand
+     * is NaN. NUM is true only when neither operand is NaN and NAN is its
+     * complement. The signalling variants (S*) yield the same boolean
+     * result as their quiet counterparts; no exception is modelled here.
+     *
+     * For integer operands no value is unordered, so every form reduces to
+     * the plain relational operator.
+     *
+     * @param src0  left-hand operand
+     * @param src1  right-hand operand
+     * @param cmpOp comparison to perform
+     * @return result of the comparison; calls fatal() on an unknown cmpOp
+     */
+    template<typename T>
+    bool
+    compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+    {
+        using namespace Brig;
+
+        // A value differs from itself only if it is NaN, so this is always
+        // false for integer types.
+        const bool unordered = (src0 != src0) || (src1 != src1);
+
+        switch (cmpOp) {
+          case BRIG_COMPARE_EQ:
+          case BRIG_COMPARE_SEQ:
+            return (src0 == src1);
+
+          case BRIG_COMPARE_EQU:
+          case BRIG_COMPARE_SEQU:
+            return unordered || (src0 == src1);
+
+          case BRIG_COMPARE_NE:
+          case BRIG_COMPARE_SNE:
+            // operator!= is true for NaN, which an ordered compare must
+            // not report
+            return !unordered && (src0 != src1);
+
+          case BRIG_COMPARE_NEU:
+          case BRIG_COMPARE_SNEU:
+            return (src0 != src1);
+
+          case BRIG_COMPARE_LT:
+          case BRIG_COMPARE_SLT:
+            return (src0 < src1);
+
+          case BRIG_COMPARE_LTU:
+          case BRIG_COMPARE_SLTU:
+            return unordered || (src0 < src1);
+
+          case BRIG_COMPARE_LE:
+          case BRIG_COMPARE_SLE:
+            return (src0 <= src1);
+
+          case BRIG_COMPARE_LEU:
+          case BRIG_COMPARE_SLEU:
+            return unordered || (src0 <= src1);
+
+          case BRIG_COMPARE_GT:
+          case BRIG_COMPARE_SGT:
+            return (src0 > src1);
+
+          case BRIG_COMPARE_GTU:
+          case BRIG_COMPARE_SGTU:
+            return unordered || (src0 > src1);
+
+          case BRIG_COMPARE_GE:
+          case BRIG_COMPARE_SGE:
+            return (src0 >= src1);
+
+          case BRIG_COMPARE_GEU:
+          case BRIG_COMPARE_SGEU:
+            return unordered || (src0 >= src1);
+
+          case BRIG_COMPARE_NUM:
+          case BRIG_COMPARE_SNUM:
+            // both operands must be numbers
+            return !unordered;
+
+          case BRIG_COMPARE_NAN:
+          case BRIG_COMPARE_SNAN:
+            return unordered;
+
+          default:
+            fatal("Bad cmpOp value %d\n", (int)cmpOp);
+        }
+    }
+
+    /**
+     * Count the bits that precede the first significant bit of src0,
+     * scanning down from the most significant bit.
+     *
+     * For a non-negative value the first significant bit is the highest
+     * set bit; for a negative value it is the highest clear bit, found by
+     * searching the complement.
+     *
+     * @param src0 value to scan; may be any integer type up to 64 bits
+     * @return number of bits above the first significant bit, or -1 when
+     *         there is none (src0 is 0, or -1 for signed types)
+     */
+    template<typename T>
+    int32_t
+    firstbit(T src0)
+    {
+        // handle positive and negative numbers
+        T tmp = (src0 < 0) ? (~src0) : (src0);
+
+        // 0 has no set bit and -1 has no clear bit; without this check the
+        // search below would run past bit 0
+        if (!tmp)
+            return -1;
+
+        // tmp is non-negative here, so widening preserves its bit pattern.
+        // Probing with a 64-bit mask keeps the shift defined for every
+        // operand width, whereas an int-typed "1 << pos" is not for
+        // pos >= 31.
+        const uint64_t bits = static_cast<uint64_t>(tmp);
+
+        // the starting pos is MSB
+        int pos = 8 * sizeof(T) - 1;
+        int32_t cnt = 0;
+
+        // search the first bit set to 1
+        while (!(bits & (static_cast<uint64_t>(1) << pos))) {
+            ++cnt;
+            --pos;
+        }
+        return cnt;
+    }
+
+    const char* cmpOpToString(Brig::BrigCompareOperation cmpOp);
+
+    /**
+     * Base for compare instructions: a two-source CommonInstBase that also
+     * records which comparison the BRIG instruction encodes.
+     */
+    template<typename DestOperandType, typename SrcOperandType>
+    class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType,
+                                              2>
+    {
+      protected:
+        // comparison selected by the instruction's "compare" field
+        Brig::BrigCompareOperation cmpOp;
+
+      public:
+        CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                    const char *_opcode)
+            : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj,
+                                                                 _opcode)
+        {
+            // Only a BRIG cmp instruction may be viewed as BrigInstCmp.
+            assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP);
+
+            const Brig::BrigInstCmp *cmp_inst = (const Brig::BrigInstCmp*)ib;
+            cmpOp = (Brig::BrigCompareOperation)cmp_inst->compare;
+        }
+    };
+
+    /**
+     * Compare instruction parameterised on destination and source data
+     * types.
+     */
+    template<typename DestDataType, typename SrcDataType>
+    class CmpInst : public CmpInstBase<typename DestDataType::OperandType,
+                                       typename SrcDataType::OperandType>
+    {
+      public:
+        CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                const char *_opcode)
+            : CmpInstBase<typename DestDataType::OperandType,
+                          typename SrcDataType::OperandType>(ib, obj, _opcode)
+        {
+        }
+
+        // Opcode suffix of the form "_<cmpOp>_<dest label>_<src label>".
+        std::string
+        opcode_suffix()
+        {
+            const char *cmp_name = cmpOpToString(this->cmpOp);
+            return csprintf("_%s_%s_%s", cmp_name,
+                            DestDataType::label, SrcDataType::label);
+        }
+    };
+
+    /**
+     * Single-source instruction whose destination and source data types
+     * may differ.
+     */
+    template<typename DestDataType, typename SrcDataType>
+    class CvtInst : public CommonInstBase<typename DestDataType::OperandType,
+                                          typename SrcDataType::OperandType, 1>
+    {
+      public:
+        CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                const char *_opcode)
+            : CommonInstBase<typename DestDataType::OperandType,
+                             typename SrcDataType::OperandType,
+                             1>(ib, obj, _opcode)
+        {
+        }
+
+        // Opcode suffix of the form "_<dest label>_<src label>".
+        std::string
+        opcode_suffix()
+        {
+            return csprintf("_%s_%s", DestDataType::label, SrcDataType::label);
+        }
+    };
+
+    /**
+     * Base for instructions that carry no operands at all. Every operand
+     * query reports "nothing there"; the BRIG instruction pointer handed to
+     * the constructor is not consulted.
+     */
+    class SpecialInstNoSrcNoDest : public HsailGPUStaticInst
+    {
+      public:
+        SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib,
+                               const BrigObject *obj, const char *_opcode)
+            : HsailGPUStaticInst(obj, _opcode)
+        {
+        }
+
+        // per-operand queries: there is no operand to describe
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isDstOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            return 0;
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            return -1;
+        }
+
+        // operand counts
+        int
+        numSrcRegOperands()
+        {
+            return 0;
+        }
+
+        int
+        numDstRegOperands()
+        {
+            return 0;
+        }
+
+        int
+        getNumOperands()
+        {
+            return 0;
+        }
+    };
+
+    /**
+     * Base for instructions that have a destination but no source
+     * operands. The destination is the only operand, so every per-operand
+     * query answers for it.
+     */
+    template<typename DestOperandType>
+    class SpecialInstNoSrcBase : public HsailGPUStaticInst
+    {
+      protected:
+        typename DestOperandType::DestOperand dest;
+
+        // Render "<opcode> <dest>".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s %s", opcode, dest.disassemble());
+        }
+
+      public:
+        SpecialInstNoSrcBase(const Brig::BrigInstBase *ib,
+                             const BrigObject *obj, const char *_opcode)
+            : HsailGPUStaticInst(obj, _opcode)
+        {
+            // the destination is BRIG operand slot 0
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+        }
+
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isVectorRegister();
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isCondRegister();
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isScalarRegister();
+        }
+
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isDstOperand(int operandIndex)
+        {
+            return true;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.opSize();
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.regIndex();
+        }
+
+        int
+        numSrcRegOperands()
+        {
+            return 0;
+        }
+
+        // 1 when the destination is a vector register, otherwise 0.
+        int
+        numDstRegOperands()
+        {
+            return dest.isVectorRegister();
+        }
+
+        int
+        getNumOperands()
+        {
+            return 1;
+        }
+    };
+
+    /**
+     * Data-type-level wrapper around SpecialInstNoSrcBase that publishes
+     * the destination's C type.
+     */
+    template<typename DestDataType>
+    class SpecialInstNoSrc :
+        public SpecialInstNoSrcBase<typename DestDataType::OperandType>
+    {
+      public:
+        using DestCType = typename DestDataType::CType;
+
+        SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                         const char *_opcode)
+            : SpecialInstNoSrcBase<typename DestDataType::OperandType>(
+                  ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * Base for instructions with a destination and one immediate source.
+     *
+     * Only the destination is exposed through the operand queries:
+     * getNumOperands() reports 1 and the immediate is never counted as a
+     * source operand.
+     */
+    template<typename DestOperandType>
+    class SpecialInst1SrcBase : public HsailGPUStaticInst
+    {
+      protected:
+        using SrcCType = int;  // used in execute() template
+
+        typename DestOperandType::DestOperand dest;
+        ImmOperand<SrcCType> src0;
+
+        // Render "<opcode> <dest>,<src0>".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(),
+                                   src0.disassemble());
+        }
+
+      public:
+        SpecialInst1SrcBase(const Brig::BrigInstBase *ib,
+                            const BrigObject *obj, const char *_opcode)
+            : HsailGPUStaticInst(obj, _opcode)
+        {
+            // BRIG slot 0 is the destination, slot 1 the immediate source.
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+
+            unsigned src0_offs = obj->getOperandPtr(ib->operands, 1);
+            src0.init(src0_offs, obj);
+        }
+
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isVectorRegister();
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isCondRegister();
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.isScalarRegister();
+        }
+
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isDstOperand(int operandIndex)
+        {
+            return true;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.opSize();
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return dest.regIndex();
+        }
+
+        int
+        numSrcRegOperands()
+        {
+            return 0;
+        }
+
+        // 1 when the destination is a vector register, otherwise 0.
+        int
+        numDstRegOperands()
+        {
+            return dest.isVectorRegister();
+        }
+
+        int
+        getNumOperands()
+        {
+            return 1;
+        }
+    };
+
+    /**
+     * Data-type-level wrapper around SpecialInst1SrcBase that publishes the
+     * destination's C type.
+     */
+    template<typename DestDataType>
+    class SpecialInst1Src :
+        public SpecialInst1SrcBase<typename DestDataType::OperandType>
+    {
+      public:
+        using DestCType = typename DestDataType::CType;
+
+        SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                        const char *_opcode)
+            : SpecialInst1SrcBase<typename DestDataType::OperandType>(
+                  ib, obj, _opcode)
+        {
+        }
+    };
+
+    /**
+     * The operand-less "ret" instruction; tagged as OT_RET. execute() is
+     * defined out of line.
+     */
+    class Ret : public SpecialInstNoSrcNoDest
+    {
+      public:
+        using Base = SpecialInstNoSrcNoDest;
+
+        Ret(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : Base(ib, obj, "ret")
+        {
+            o_type = Enums::OT_RET;
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    /**
+     * The operand-less "barrier" instruction; tagged as OT_BARRIER.
+     * execute() is defined out of line.
+     */
+    class Barrier : public SpecialInstNoSrcNoDest
+    {
+      public:
+        using Base = SpecialInstNoSrcNoDest;
+
+        // "width" field of the underlying BrigInstBr, narrowed to 8 bits
+        uint8_t width;
+
+        Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : Base(ib, obj, "barrier")
+        {
+            o_type = Enums::OT_BARRIER;
+
+            // Only a BRIG br-format instruction may be viewed as BrigInstBr.
+            assert(ib->base.kind == Brig::BRIG_KIND_INST_BR);
+
+            const Brig::BrigInstBr *br_inst = (const Brig::BrigInstBr*)ib;
+            width = (uint8_t)br_inst->width;
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    /**
+     * The "memfence" instruction. The constructor copies the memory order
+     * and the per-segment scopes out of the BRIG encoding and derives the
+     * operation type from the global and group scopes.
+     */
+    class MemFence : public SpecialInstNoSrcNoDest
+    {
+      public:
+        using Base = SpecialInstNoSrcNoDest;
+
+        Brig::BrigMemoryOrder memFenceMemOrder;
+        Brig::BrigMemoryScope memFenceScopeSegGroup;
+        Brig::BrigMemoryScope memFenceScopeSegGlobal;
+        Brig::BrigMemoryScope memFenceScopeSegImage;
+
+        MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : Base(ib, obj, "memfence")
+        {
+            assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE);
+
+            // View the instruction as a memfence once and pull every field
+            // from that view.
+            const Brig::BrigInstMemFence *fence =
+                (const Brig::BrigInstMemFence*)ib;
+
+            memFenceScopeSegGlobal =
+                (Brig::BrigMemoryScope)fence->globalSegmentMemoryScope;
+            memFenceScopeSegGroup =
+                (Brig::BrigMemoryScope)fence->groupSegmentMemoryScope;
+            memFenceScopeSegImage =
+                (Brig::BrigMemoryScope)fence->imageSegmentMemoryScope;
+            memFenceMemOrder = (Brig::BrigMemoryOrder)fence->memoryOrder;
+
+            // set o_type based on scopes; the image scope plays no part
+            const bool fences_global =
+                memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE;
+            const bool fences_group =
+                memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE;
+
+            if (fences_global && fences_group) {
+                o_type = Enums::OT_BOTH_MEMFENCE;
+            } else if (fences_global) {
+                o_type = Enums::OT_GLOBAL_MEMFENCE;
+            } else if (fences_group) {
+                o_type = Enums::OT_SHARED_MEMFENCE;
+            } else {
+                fatal("MemFence constructor: bad scope specifiers\n");
+            }
+        }
+
+        // Hand the fence to the compute unit of the issuing wavefront.
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst)
+        {
+            gpuDynInst->wavefront()->computeUnit->
+                injectGlobalMemFence(gpuDynInst);
+        }
+
+        /**
+         * Two situations are distinguished:
+         *   - a fence that only covers the group segment, which is treated
+         *     as a no-op (the original comment cites sequentially
+         *     consistent memory such as the LDS);
+         *   - a fence that covers the global segment, for which the
+         *     request is tagged with memory order and scope and queued on
+         *     the global memory pipeline (the original comment names
+         *     relaxed-consistency caches such as Hermes and Viper and
+         *     leaves the handling to the GPU coalescer).
+         */
+        void
+        execute(GPUDynInstPtr gpuDynInst)
+        {
+            Wavefront *w = gpuDynInst->wavefront();
+
+            const bool touches_global =
+                o_type == Enums::OT_GLOBAL_MEMFENCE ||
+                o_type == Enums::OT_BOTH_MEMFENCE;
+
+            if (touches_global) {
+                // identify the issuing wavefront in the request
+                gpuDynInst->simdId = w->simdId;
+                gpuDynInst->wfSlotId = w->wfSlotId;
+                gpuDynInst->wfDynId = w->wfDynId;
+                gpuDynInst->kern_id = w->kern_id;
+                gpuDynInst->cu_id = w->computeUnit->cu_id;
+
+                gpuDynInst->memoryOrder =
+                    getGenericMemoryOrder(memFenceMemOrder);
+                gpuDynInst->scope =
+                    getGenericMemoryScope(memFenceScopeSegGlobal);
+                gpuDynInst->useContinuation = false;
+
+                w->computeUnit->globalMemoryPipe.getGMReqFIFO().
+                    push(gpuDynInst);
+
+                // wavefront request bookkeeping
+                w->wr_gm_reqs_in_pipe--;
+                w->rd_gm_reqs_in_pipe--;
+                w->mem_reqs_in_pipe--;
+                w->outstanding_reqs++;
+            } else if (o_type != Enums::OT_SHARED_MEMFENCE) {
+                // a group-only fence falls through as a no-op; anything
+                // else is an error
+                fatal("MemFence execute: bad o_type\n");
+            }
+        }
+    };
+
+    /**
+     * The "call" instruction.
+     *
+     * A callee whose name contains "__gem5_hsail_op" is treated as a
+     * pseudo-op and dispatched through execPseudoInst(). Any other callee
+     * is resolved when the instruction is constructed, but executing it is
+     * reported as unimplemented.
+     */
+    class Call : public HsailGPUStaticInst
+    {
+      public:
+        // private helper functions
+        void calcAddr(Wavefront* w, GPUDynInstPtr m);
+
+        // Render "<opcode> <callee> (<dest>) (<src1>)"; the "(<dest>)" group
+        // is omitted when the destination list disassembles to nothing.
+        void
+        generateDisassembly()
+        {
+            if (dest.disassemble() != "") {
+                disassembly = csprintf("%s %s (%s) (%s)", opcode,
+                                       src0.disassemble(), dest.disassemble(),
+                                       src1.disassemble());
+            } else {
+                disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(),
+                                       src1.disassemble());
+            }
+        }
+
+        // True when the callee name contains the pseudo-op marker.
+        bool
+        isPseudoOp()
+        {
+            const std::string callee = src0.disassemble();
+            return callee.find("__gem5_hsail_op") != std::string::npos;
+        }
+
+        // member variables
+        ListOperand dest;
+        FunctionRefOperand src0;
+        ListOperand src1;
+        HsailCode *func_ptr;
+
+        // exec function for pseudo instructions mapped on top of call opcode
+        void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+        // user-defined pseudo instructions
+        void MagicPrintLane(Wavefront *w);
+        void MagicPrintLane64(Wavefront *w);
+        void MagicPrintWF32(Wavefront *w);
+        void MagicPrintWF64(Wavefront *w);
+        void MagicPrintWFFloat(Wavefront *w);
+        void MagicSimBreak(Wavefront *w);
+        void MagicPrefixSum(Wavefront *w);
+        void MagicReduction(Wavefront *w);
+        void MagicMaskLower(Wavefront *w);
+        void MagicMaskUpper(Wavefront *w);
+        void MagicJoinWFBar(Wavefront *w);
+        void MagicWaitWFBar(Wavefront *w);
+        void MagicPanic(Wavefront *w);
+
+        void MagicAtomicNRAddGlobalU32Reg(Wavefront *w,
+                                          GPUDynInstPtr gpuDynInst);
+
+        void MagicAtomicNRAddGroupU32Reg(Wavefront *w,
+                                         GPUDynInstPtr gpuDynInst);
+
+        void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+        void MagicXactCasLd(Wavefront *w);
+        void MagicMostSigThread(Wavefront *w);
+        void MagicMostSigBroadcast(Wavefront *w);
+
+        void MagicPrintWF32ID(Wavefront *w);
+        void MagicPrintWFID64(Wavefront *w);
+
+        Call(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : HsailGPUStaticInst(obj, "call")
+        {
+            // BRIG slot 0: destination list, slot 1: callee reference
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+            unsigned callee_offs = obj->getOperandPtr(ib->operands, 1);
+            src0.init(callee_offs, obj);
+
+            // Pseudo-ops have no HSAIL body, so func_ptr stays null for
+            // them; every other callee has to be known to the BRIG object.
+            func_ptr = nullptr;
+            std::string callee = src0.disassemble();
+            if (!isPseudoOp()) {
+                func_ptr =
+                    dynamic_cast<HsailCode*>(obj->getFunction(callee));
+
+                if (!func_ptr)
+                    fatal("call::exec cannot find function: %s\n", callee);
+            }
+
+            // BRIG slot 2: source list
+            unsigned args_offs = obj->getOperandPtr(ib->operands, 2);
+            src1.init(args_offs, obj);
+        }
+
+        // None of the call's operands is reported as a register operand.
+        bool
+        isVectorRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isCondRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isScalarRegister(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isSrcOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        bool
+        isDstOperand(int operandIndex)
+        {
+            return false;
+        }
+
+        int
+        getOperandSize(int operandIndex)
+        {
+            return 0;
+        }
+
+        int
+        getRegisterIndex(int operandIndex)
+        {
+            return -1;
+        }
+
+        // Run the pseudo-op, or stop the simulation for any other callee.
+        void
+        execute(GPUDynInstPtr gpuDynInst)
+        {
+            Wavefront *w = gpuDynInst->wavefront();
+
+            std::string callee = src0.disassemble();
+            if (!isPseudoOp()) {
+                fatal("Native HSAIL functions are not yet implemented: %s\n",
+                      callee);
+            } else {
+                execPseudoInst(w, gpuDynInst);
+            }
+        }
+
+        int
+        numSrcRegOperands()
+        {
+            return 0;
+        }
+
+        int
+        numDstRegOperands()
+        {
+            return 0;
+        }
+
+        int
+        getNumOperands()
+        {
+            return 2;
+        }
+    };
+
+    // Complement of arg: bitwise for every type except bool.
+    template<typename T>
+    T
+    heynot(T arg)
+    {
+        return ~arg;
+    }
+
+    // For bool, ~ would promote to int and give a non-zero (true) result
+    // for both inputs, so logical negation is used instead.
+    template<>
+    inline bool
+    heynot<bool>(bool arg)
+    {
+        return !arg;
+    }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_DECL_HH__
diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc
new file mode 100644
index 000000000..bbaeb13e6
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+    HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
+                                           const std::string &opcode)
+        : GPUStaticInst(opcode), hsailCode(obj->currentCode)
+    { }
+
+    // The disassembly produced by this class is just the opcode string.
+    void
+    HsailGPUStaticInst::generateDisassembly()
+    {
+        disassembly = opcode;
+    }
+
+    // Return the disassembly text, generating it if it is still empty.
+    const std::string&
+    HsailGPUStaticInst::disassemble()
+    {
+        if (!disassembly.empty())
+            return disassembly;
+        generateDisassembly();
+        assert(!disassembly.empty());
+        return disassembly;
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh
new file mode 100644
index 000000000..29aab1f70
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+
+/**
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing HSAIL GPU static instructions.
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+class BrigObject;
+class HsailCode;
+
+namespace HsailISA
+{
+    class HsailGPUStaticInst : public GPUStaticInst
+    {
+      public:
+        HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
+        void generateDisassembly();  // sets disassembly to the opcode
+        const std::string &disassemble();  // generates the text if empty
+        uint32_t instSize() { return 4; }  // same value for every instance
+
+      protected:
+        HsailCode *hsailCode;  // initialised from obj->currentCode
+    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
new file mode 100644
index 000000000..4e70bf46a
--- /dev/null
+++ b/src/arch/hsail/insts/main.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/decl.hh"
+#include "debug/GPUExec.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+
+namespace HsailISA
+{
+    template<> const char *B1::label = "b1";  // per-type label strings
+    template<> const char *B8::label = "b8";
+    template<> const char *B16::label = "b16";
+    template<> const char *B32::label = "b32";
+    template<> const char *B64::label = "b64";
+
+    template<> const char *S8::label = "s8";
+    template<> const char *S16::label = "s16";
+    template<> const char *S32::label = "s32";
+    template<> const char *S64::label = "s64";
+
+    template<> const char *U8::label = "u8";
+    template<> const char *U16::label = "u16";
+    template<> const char *U32::label = "u32";
+    template<> const char *U64::label = "u64";
+
+    template<> const char *F32::label = "f32";
+    template<> const char *F64::label = "f64";
+
+    // Map a BRIG compare operation to its mnemonic string; a value with
+    // no case below yields "unknown".
+    const char*
+    cmpOpToString(Brig::BrigCompareOperation cmpOp)
+    {
+        switch (cmpOp) {
+          case Brig::BRIG_COMPARE_EQ:
+            return "eq";
+          case Brig::BRIG_COMPARE_NE:
+            return "ne";
+          case Brig::BRIG_COMPARE_LT:
+            return "lt";
+          case Brig::BRIG_COMPARE_LE:
+            return "le";
+          case Brig::BRIG_COMPARE_GT:
+            return "gt";
+          case Brig::BRIG_COMPARE_GE:
+            return "ge";
+          case Brig::BRIG_COMPARE_EQU:
+            return "equ";
+          case Brig::BRIG_COMPARE_NEU:
+            return "neu";
+          case Brig::BRIG_COMPARE_LTU:
+            return "ltu";
+          case Brig::BRIG_COMPARE_LEU:
+            return "leu";
+          case Brig::BRIG_COMPARE_GTU:
+            return "gtu";
+          case Brig::BRIG_COMPARE_GEU:
+            return "geu";
+          case Brig::BRIG_COMPARE_NUM:
+            return "num";
+          case Brig::BRIG_COMPARE_NAN:
+            return "nan";
+          case Brig::BRIG_COMPARE_SEQ:
+            return "seq";
+          case Brig::BRIG_COMPARE_SNE:
+            return "sne";
+          case Brig::BRIG_COMPARE_SLT:
+            return "slt";
+          case Brig::BRIG_COMPARE_SLE:
+            return "sle";
+          case Brig::BRIG_COMPARE_SGT:
+            return "sgt";
+          case Brig::BRIG_COMPARE_SGE:
+            return "sge";
+          case Brig::BRIG_COMPARE_SEQU:
+            return "sequ";
+          case Brig::BRIG_COMPARE_SNEU:
+            return "sneu";
+          case Brig::BRIG_COMPARE_SLTU:
+            return "sltu";
+          case Brig::BRIG_COMPARE_SLEU:
+            return "sleu";
+          case Brig::BRIG_COMPARE_SGTU:
+            return "sgtu";
+          case Brig::BRIG_COMPARE_SGEU:
+            return "sgeu";
+          case Brig::BRIG_COMPARE_SNUM:
+            return "snum";
+          case Brig::BRIG_COMPARE_SNAN:
+            return "snan";
+          default:
+            return "unknown";
+        }
+    }
+
+    void
+    Ret::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+        // Retire the lanes selected by the wavefront's current predicate.
+        const VectorMask &mask = w->get_pred();
+
+        // clear the init_mask bit of every lane set in the predicate mask
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->init_mask[lane] = 0;
+            }
+            // lanes not set in the mask keep their init_mask bit
+        }
+
+        // drop every buffered instruction after the first entry
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+        if (w->pendingFetch) {
+            w->dropFetch = true;
+        }
+
+        // if all work-items have completed, then wave-front is done
+        if (w->init_mask.none()) {
+            w->status = Wavefront::S_STOPPED;
+
+            int32_t refCount = w->computeUnit->getLds().
+                                   decreaseRefCounter(w->dispatchid, w->wg_id);
+
+            DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
+                            w->computeUnit->cu_id, w->wg_id, refCount);
+
+            // free the vector registers of the completed wavefront
+            w->computeUnit->vectorRegsReserved[w->simdId] -=
+                w->reservedVectorRegs;
+            // NOTE(review): only meaningful if the counter is signed - confirm
+            assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
+            // last register of the region, taken modulo the file's numRegs()
+            uint32_t endIndex = (w->startVgprIndex +
+                                 w->reservedVectorRegs - 1) %
+                w->computeUnit->vrf[w->simdId]->numRegs();
+
+            w->computeUnit->vrf[w->simdId]->manager->
+                freeRegion(w->startVgprIndex, endIndex);
+
+            w->reservedVectorRegs = 0;
+            w->startVgprIndex = 0;
+            w->computeUnit->completedWfs++;
+
+            DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
+                    w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
+
+            if (!refCount) {
+                // Notify Memory System of Kernel Completion
+                // Kernel End = isKernel + isRelease
+                w->status = Wavefront::S_RETURNING;
+                GPUDynInstPtr local_mempacket = gpuDynInst;
+                local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
+                local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
+                local_mempacket->useContinuation = false;
+                local_mempacket->simdId = w->simdId;
+                local_mempacket->wfSlotId = w->wfSlotId;
+                local_mempacket->wfDynId = w->wfDynId;
+                w->computeUnit->injectGlobalMemFence(local_mempacket, true);
+            } else {
+                w->computeUnit->shader->dispatcher->scheduleDispatch();
+            }
+        }
+    }
+
+    // Advance the wavefront's barrier count and flag it as stalled.
+    void
+    Barrier::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        assert(wf->barrier_cnt == wf->old_barrier_cnt);
+        wf->stalledAtBarrier = true;
+        wf->barrier_cnt = wf->old_barrier_cnt + 1;
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc
new file mode 100644
index 000000000..97d4c902b
--- /dev/null
+++ b/src/arch/hsail/insts/mem.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/mem.hh"
+
+#include "arch/hsail/Brig.h"
+#include "enums/OpType.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+    const char* atomicOpToString(BrigAtomicOperation brigOp);
+
+    // Map a BRIG atomic opcode/operation pair to its Enums::MemOpType.
+    Enums::MemOpType
+    brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
+    {
+        if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
+            switch (brigOp) {
+              case BRIG_ATOMIC_ADD:
+                return Enums::MO_AADD;
+              case BRIG_ATOMIC_SUB:
+                return Enums::MO_ASUB;
+              case BRIG_ATOMIC_WRAPINC:
+                return Enums::MO_AINC;
+              case BRIG_ATOMIC_WRAPDEC:
+                return Enums::MO_ADEC;
+              case BRIG_ATOMIC_MIN:
+                return Enums::MO_AMIN;
+              case BRIG_ATOMIC_MAX:
+                return Enums::MO_AMAX;
+              case BRIG_ATOMIC_AND:
+                return Enums::MO_AAND;
+              case BRIG_ATOMIC_OR:
+                return Enums::MO_AOR;
+              case BRIG_ATOMIC_XOR:
+                return Enums::MO_AXOR;
+              case BRIG_ATOMIC_EXCH:
+                return Enums::MO_AEXCH;
+              case BRIG_ATOMIC_CAS:
+                return Enums::MO_ACAS;
+              default:
+                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+            }
+        } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
+            switch (brigOp) {
+              case BRIG_ATOMIC_ADD:
+                return Enums::MO_ANRADD;
+              case BRIG_ATOMIC_SUB:
+                return Enums::MO_ANRSUB;
+              case BRIG_ATOMIC_WRAPINC:
+                return Enums::MO_ANRINC;
+              case BRIG_ATOMIC_WRAPDEC:
+                return Enums::MO_ANRDEC;
+              case BRIG_ATOMIC_MIN:
+                return Enums::MO_ANRMIN;
+              case BRIG_ATOMIC_MAX:
+                return Enums::MO_ANRMAX;
+              case BRIG_ATOMIC_AND:
+                return Enums::MO_ANRAND;
+              case BRIG_ATOMIC_OR:
+                return Enums::MO_ANROR;
+              case BRIG_ATOMIC_XOR:
+                return Enums::MO_ANRXOR;
+              case BRIG_ATOMIC_EXCH:
+                return Enums::MO_ANREXCH;
+              case BRIG_ATOMIC_CAS:
+                return Enums::MO_ANRCAS;
+              default:
+                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+            }
+        }
+        fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
+    }
+
+    const char*
+    atomicOpToString(BrigAtomicOperation brigOp)
+    {
+        switch (brigOp) {  // mnemonic string for each atomic operation
+          case BRIG_ATOMIC_ADD:
+            return "add";
+          case BRIG_ATOMIC_SUB:
+            return "sub";
+          case BRIG_ATOMIC_WRAPINC:
+            return "inc";
+          case BRIG_ATOMIC_WRAPDEC:
+            return "dec";
+          case BRIG_ATOMIC_MIN:
+            return "min";
+          case BRIG_ATOMIC_MAX:
+            return "max";
+          case BRIG_ATOMIC_AND:
+            return "and";
+          case BRIG_ATOMIC_OR:
+            return "or";
+          case BRIG_ATOMIC_XOR:
+            return "xor";
+          case BRIG_ATOMIC_EXCH:
+            return "exch";
+          case BRIG_ATOMIC_CAS:
+            return "cas";
+          default:  // any operation without a case above
+            return "unknown";
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
new file mode 100644
index 000000000..d3ce76dee
--- /dev/null
+++ b/src/arch/hsail/insts/mem.hh
@@ -0,0 +1,1629 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
+#define __ARCH_HSAIL_INSTS_MEM_HH__
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+
+namespace HsailISA
+{
+    /**
+     * Mixin for memory instructions: holds the access size chosen from
+     * the memory type and a pointer to the instruction's address
+     * operand.
+     */
+    class MemInst
+    {
+      public:
+        MemInst() : size(0), addr_operand(nullptr) { }
+
+        // Size is 8 for the *64 memory types, 4 for *32, 2 for *16 and
+        // 1 for every other type.
+        MemInst(Enums::MemType m_type) : size(1), addr_operand(nullptr)
+        {
+            const bool is64 = m_type == Enums::M_U64 ||
+                m_type == Enums::M_S64 || m_type == Enums::M_F64;
+            const bool is32 = m_type == Enums::M_U32 ||
+                m_type == Enums::M_S32 || m_type == Enums::M_F32;
+            const bool is16 = m_type == Enums::M_U16 ||
+                m_type == Enums::M_S16 || m_type == Enums::M_F16;
+            size = is64 ? 8 : (is32 ? 4 : (is16 ? 2 : 1));
+        }
+
+        // Remember the address operand; the pointer is stored as given.
+        void
+        init_addr(AddrOperandBase *_addr_operand)
+        {
+            addr_operand = _addr_operand;
+        }
+
+        // Accessors for the recorded size and address operand.
+        int getMemOperandSize() { return size; }
+        AddrOperandBase *getAddressOperand() { return addr_operand; }
+
+      private:
+        // 0 when default-constructed, otherwise chosen from the type
+        int size;
+        // nullptr until init_addr() is called
+        AddrOperandBase *addr_operand;
+    };
+
+    // Base for lda instructions: operand 0 is the destination and
+    // operand 1 is the address.
+    template<typename DestOperandType, typename AddrOperandType>
+    class LdaInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                    const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(dest_offs, obj);
+            unsigned addr_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(addr_offs, obj);
+        }
+
+        // Each count is the matching operand's isVectorRegister() result.
+        int numSrcRegOperands() { return addr.isVectorRegister(); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+
+        // The address counts as an operand only if it is a vector register.
+        int getNumOperands()
+        {
+            return addr.isVectorRegister() ? 2 : 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex == 0) ? dest.isVectorRegister()
+                                       : addr.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex == 0) ? dest.isCondRegister()
+                                       : addr.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex == 0) ? dest.isScalarRegister()
+                                       : addr.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex > 0) && addr.isVectorRegister();
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return operandIndex == 0;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex == 0) ? dest.opSize()
+                                       : addr.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return (operandIndex == 0) ? dest.regIndex()
+                                       : addr.regIndex();
+        }
+    };
+
+    template<typename DestDataType, typename AddrOperandType>
+    class LdaInst :
+        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst  // default-constructed here, so its size stays 0
+    {
+      public:
+        void generateDisassembly();  // declared only; defined elsewhere
+
+        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                        const char *_opcode)
+            : LdaInstBase<typename DestDataType::OperandType,
+                          AddrOperandType>(ib, obj, _opcode)
+        {
+            init_addr(&this->addr);  // hand the address operand to MemInst
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);  // declared only
+    };
+
+    // Build the LdaInst specialisation matching operand 1: an address
+    // operand or a single/double register. Other forms are fatal, and the
+    // message names the value that was actually rejected.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
+
+        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS)
+            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
+        if (regDataType.kind != Brig::BRIG_KIND_OPERAND_REGISTER)
+            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
+        switch (regDataType.regKind) {  // V2/V4 not allowed
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
+          default:
+            fatal("Bad ldas register kind %d\n", regDataType.regKind);
+        }
+    }
+
+    template<typename MemOperandType, typename DestOperandType,
+             typename AddrOperandType>
+    class LdInstBase : public HsailGPUStaticInst
+    {
+      public:
+        Brig::BrigWidth8_t width;
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigMemoryScope memoryScope;
+        unsigned int equivClass;
+        bool isArgLoad()
+        {
+            return segment == Brig::BRIG_SEGMENT_KERNARG ||
+                   segment == Brig::BRIG_SEGMENT_ARG;
+        }
+        void
+        initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = ldst->width;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        void
+        initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            equivClass = 0;
+
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = BRIG_WIDTH_1;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands,1);
+            addr.init(op_offs, obj);
+        }
+
+        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_LD) {
+                initLd(ib, obj, _opcode);
+            } else {
+                initAtomicLd(ib, obj, _opcode);
+            }
+        }
+
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)  // only operand 0 is a dest
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return !operandIndex;
+        }
+        int getOperandSize(int operandIndex)  // 0: dest, otherwise addr
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return (operandIndex != 0) ? this->addr.opSize()
+                                       : dest.opSize();
+        }
+        int getRegisterIndex(int operandIndex)  // 0: dest, otherwise addr
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return (operandIndex != 0) ? this->addr.regIndex()
+                                       : dest.regIndex();
+        }
+    };
+
+    /**
+     * HSAIL load. MemDataType selects the element type read from memory,
+     * DestDataType the destination register type and AddrOperandType the
+     * form of the address. Operand-list destinations (at most four) live
+     * in dest_vect; a lone destination uses the dest member of LdInstBase.
+     */
+    template<typename MemDataType, typename DestDataType,
+             typename AddrOperandType>
+    class LdInst :
+        public LdInstBase<typename MemDataType::CType,
+                          typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+        typename DestDataType::OperandType::DestOperand dest_vect[4];
+        uint16_t num_dest_operands;
+        void generateDisassembly();
+
+      public:
+        /**
+         * Decode operand 0. An operand list yields as many destinations as
+         * the unsigned value at the start of its data divided by 4; any
+         * other operand kind counts as a single destination.
+         */
+        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+            : LdInstBase<typename MemDataType::CType,
+                         typename DestDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            unsigned dst_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *dst_op = obj->getOperand(dst_offs);
+            const bool is_list =
+                (dst_op->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+            num_dest_operands = 1;
+            if (is_list) {
+                const Brig::BrigOperandOperandList *reg_list =
+                    (const Brig::BrigOperandOperandList*)dst_op;
+                num_dest_operands =
+                    *((unsigned*)obj->getData(reg_list->elements)) / 4;
+                assert(num_dest_operands <= 4);
+            }
+
+            if (num_dest_operands > 1) {
+                assert(is_list);
+                for (int idx = 0; idx < num_dest_operands; ++idx)
+                    dest_vect[idx].init_from_vect(dst_offs, obj, idx);
+            }
+        }
+
+        /**
+         * Start the memory accesses. For destination k, each active lane i
+         * reads addr[i] + k * sizeof(element) into d_data[k * VSZ + i];
+         * group-segment loads read the LDS chunk directly, all others are
+         * handed to the compute unit as ReadReq packets.
+         */
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType ElemT;
+
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            // operand-list loads record the destination count per active
+            // lane (0 for inactive lanes)
+            if (num_dest_operands > 1) {
+                for (int lane = 0; lane < VSZ; ++lane)
+                    if (!gpuDynInst->exec_mask[lane])
+                        gpuDynInst->statusVector.push_back(0);
+                    else
+                        gpuDynInst->statusVector.push_back(num_dest_operands);
+            }
+
+            for (int k = 0; k < num_dest_operands; ++k) {
+                ElemT *slot = (ElemT*)gpuDynInst->d_data + k * VSZ;
+
+                for (int lane = 0; lane < VSZ; ++lane, ++slot) {
+                    if (!gpuDynInst->exec_mask[lane])
+                        continue;
+
+                    Addr vaddr = gpuDynInst->addr[lane] + k * sizeof(ElemT);
+
+                    if (isLocalMem()) {
+                        // load from shared memory
+                        *slot = gpuDynInst->wavefront()->ldsChunk->
+                            read<ElemT>(vaddr);
+                        continue;
+                    }
+
+                    Request *req =
+                        new Request(0, vaddr, sizeof(ElemT), 0,
+                                    gpuDynInst->computeUnit()->masterId(),
+                                    0, gpuDynInst->wfDynId, lane);
+                    gpuDynInst->setRequestFlags(req);
+                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+                    pkt->dataStatic(slot);
+
+                    // a load with acquire semantics performs its Acquire
+                    // request from the response continuation; otherwise
+                    // the request is finished when the load completes
+                    const bool acq_after_load =
+                        gpuDynInst->computeUnit()->shader->
+                            separate_acquire_release &&
+                        gpuDynInst->memoryOrder ==
+                            Enums::MEMORY_ORDER_SC_ACQUIRE;
+                    if (acq_after_load) {
+                        gpuDynInst->execContinuation =
+                            &GPUStaticInst::execLdAcq;
+                    }
+                    gpuDynInst->useContinuation = acq_after_load;
+
+                    // translation is performed in sendRequest()
+                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                           lane, pkt);
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      private:
+        /**
+         * Continuation run after the load has completed: a non-local load
+         * with acquire semantics issues an ACQUIRE request from here.
+         */
+        void
+        execLdAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            if (isLocalMem())
+                return;
+
+            const bool wants_acquire =
+                gpuDynInst->computeUnit()->shader->separate_acquire_release &&
+                gpuDynInst->memoryOrder == Enums::MEMORY_ORDER_SC_ACQUIRE;
+            if (!wants_acquire)
+                return;
+
+            gpuDynInst->statusBitVector = VectorMask(1);
+            gpuDynInst->useContinuation = false;
+            // create request
+            Request *fence_req =
+                new Request(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(),
+                            0, gpuDynInst->wfDynId, -1);
+            fence_req->setFlags(Request::ACQUIRE);
+            gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false,
+                                                            fence_req);
+        }
+
+      public:
+        // loads from the group segment are the local-memory ones
+        bool
+        isLocalMem() const override
+        {
+            return Brig::BRIG_SEGMENT_GROUP == this->segment;
+        }
+
+        // Operand queries. Destinations are numbered first; when the
+        // address is a register operand it is the final operand.
+        bool isVectorRegister(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            if (num_dest_operands != n_ops && operandIndex == n_ops - 1)
+                return this->addr.isVectorRegister();
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isVectorRegister();
+            return num_dest_operands == 1 && this->dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            if (num_dest_operands != n_ops && operandIndex == n_ops - 1)
+                return this->addr.isCondRegister();
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isCondRegister();
+            return num_dest_operands == 1 && this->dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            if (num_dest_operands != n_ops && operandIndex == n_ops - 1)
+                return this->addr.isScalarRegister();
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isScalarRegister();
+            return num_dest_operands == 1 && this->dest.isScalarRegister();
+        }
+        // only a vector-register address is reported as a source
+        bool isSrcOperand(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            return num_dest_operands != n_ops && operandIndex == n_ops - 1 &&
+                   this->addr.isVectorRegister();
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            return num_dest_operands == n_ops || operandIndex != n_ops - 1;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            if (num_dest_operands != n_ops && operandIndex == n_ops - 1)
+                return this->addr.opSize();
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].opSize();
+            return (num_dest_operands == 1) ? this->dest.opSize() : 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            const int n_ops = getNumOperands();
+            assert(operandIndex >= 0 && operandIndex < n_ops);
+            if (num_dest_operands != n_ops && operandIndex == n_ops - 1)
+                return this->addr.regIndex();
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].regIndex();
+            if (num_dest_operands == 1)
+                return this->dest.regIndex();
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return num_dest_operands + 1;
+            return num_dest_operands;
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Second decode stage for ld: pick the LdInst specialization from the
+    // kind of operand 1 (the address) and, for registers, its register kind.
+    template<typename MemDT, typename DestDT>
+    GPUStaticInst*
+    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned addr_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo addr_info = findRegDataType(addr_offs, obj);
+
+        if (addr_info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS)
+            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
+
+        if (addr_info.kind != Brig::BRIG_KIND_OPERAND_REGISTER &&
+            addr_info.kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+            fatal("Bad ld register operand kind %d\n", addr_info.kind);
+        }
+        switch (addr_info.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            return new LdInst<MemDT, DestDT, SRegAddrOperand>(ib, obj, "ld");
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            return new LdInst<MemDT, DestDT, DRegAddrOperand>(ib, obj, "ld");
+          default:
+            fatal("Bad ld register operand type %d\n", addr_info.regKind);
+        }
+    }
+
+    // First decode stage for ld: the register kind of operand 0 and the
+    // instruction type select the destination data type; decodeLd2() then
+    // selects the addressing form.
+    template<typename MemDT>
+    GPUStaticInst*
+    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+        BrigRegOperandInfo dest = findRegDataType(dest_offs, obj);
+
+        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+        if (dest.regKind == Brig::BRIG_REGISTER_KIND_SINGLE) {
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+              case Brig::BRIG_TYPE_B16:
+              case Brig::BRIG_TYPE_B32:
+                return decodeLd2<MemDT, B32>(ib, obj);
+              // the float types share the U32 destination type
+              case Brig::BRIG_TYPE_U8:
+              case Brig::BRIG_TYPE_U16:
+              case Brig::BRIG_TYPE_U32:
+              case Brig::BRIG_TYPE_F16:
+              case Brig::BRIG_TYPE_F32:
+                return decodeLd2<MemDT, U32>(ib, obj);
+              case Brig::BRIG_TYPE_S8:
+              case Brig::BRIG_TYPE_S16:
+              case Brig::BRIG_TYPE_S32:
+                return decodeLd2<MemDT, S32>(ib, obj);
+              default:
+                break;
+            }
+        } else if (dest.regKind == Brig::BRIG_REGISTER_KIND_DOUBLE) {
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B64:
+                return decodeLd2<MemDT, B64>(ib, obj);
+              // F64 shares the U64 destination type
+              case Brig::BRIG_TYPE_U64:
+              case Brig::BRIG_TYPE_F64:
+                return decodeLd2<MemDT, U64>(ib, obj);
+              case Brig::BRIG_TYPE_S64:
+                return decodeLd2<MemDT, S64>(ib, obj);
+              default:
+                break;
+            }
+        }
+
+        fatal("Bad ld register operand type %d, %d\n", dest.regKind, ib->type);
+    }
+
+    /**
+     * State and operand decoding shared by HSAIL stores: the source and
+     * address operands together with the segment, memory scope, memory
+     * order and equivalence class read from the BRIG instruction. A
+     * BRIG_OPCODE_ST is decoded by initSt(), every other opcode by
+     * initAtomicSt().
+     */
+    template<typename MemDataType, typename SrcOperandType,
+             typename AddrOperandType>
+    class StInstBase : public HsailGPUStaticInst
+    {
+      private:
+        // Derive o_type from the segment already stored in this object;
+        // segments without a store operation type cause a panic.
+        void
+        setWriteOpType()
+        {
+            switch (segment) {
+              case Brig::BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+              case Brig::BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+        }
+
+      public:
+        // operands of the store
+        typename SrcOperandType::SrcOperand src;
+        AddrOperandType addr;
+
+        // attributes decoded from the BRIG instruction
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigMemoryOrder memoryOrder;
+        unsigned int equivClass;
+
+        /**
+         * Decode a plain st, reading the instruction as a BrigInstMem.
+         * Memory order and scope are set to NONE. Operand 0 is the source
+         * (initialized only for register or constant-bytes operands) and
+         * operand 1 the address.
+         */
+        void
+        initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            const Brig::BrigInstMem *mem_inst = (const Brig::BrigInstMem*)ib;
+
+            segment = (Brig::BrigSegment)mem_inst->segment;
+            memoryOrder = Brig::BRIG_MEMORY_ORDER_NONE;
+            memoryScope = Brig::BRIG_MEMORY_SCOPE_NONE;
+            equivClass = mem_inst->equivClass;
+            setWriteOpType();
+
+            unsigned src_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *src_op = obj->getOperand(src_offs);
+            const bool src_usable =
+                src_op->kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+                src_op->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES;
+            if (src_usable)
+                src.init(src_offs, obj);
+
+            unsigned addr_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(addr_offs, obj);
+        }
+
+        /**
+         * Decode a store expressed as an atomic, reading the instruction
+         * as a BrigInstAtomic. Scope and order come from the instruction,
+         * the equivalence class is 0, and the operand order is swapped
+         * relative to initSt(): operand 0 is the address, operand 1 the
+         * source.
+         */
+        void
+        initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            const Brig::BrigInstAtomic *atomic_inst =
+                (const Brig::BrigInstAtomic*)ib;
+
+            segment = (Brig::BrigSegment)atomic_inst->segment;
+            memoryScope = (Brig::BrigMemoryScope)atomic_inst->memoryScope;
+            memoryOrder = (Brig::BrigMemoryOrder)atomic_inst->memoryOrder;
+            equivClass = 0;
+            setWriteOpType();
+
+            unsigned addr_offs = obj->getOperandPtr(ib->operands, 0);
+            addr.init(addr_offs, obj);
+
+            unsigned src_offs = obj->getOperandPtr(ib->operands, 1);
+            src.init(src_offs, obj);
+        }
+
+        // BRIG_OPCODE_ST takes the plain-store path; all other opcodes
+        // are decoded as atomic stores.
+        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            if (ib->opcode != Brig::BRIG_OPCODE_ST)
+                initAtomicSt(ib, obj, _opcode);
+            else
+                initSt(ib, obj, _opcode);
+        }
+
+        // stores write no registers
+        int numDstRegOperands() { return 0; }
+        // counts the source and the address when they are vector registers
+        int
+        numSrcRegOperands()
+        {
+            const int src_regs = src.isVectorRegister();
+            const int addr_regs = this->addr.isVectorRegister();
+            return src_regs + addr_regs;
+        }
+
+        // operand 0 is the source; a register address adds operand 1
+        int
+        getNumOperands()
+        {
+            const bool addr_in_reg = this->addr.isVectorRegister() ||
+                                     this->addr.isScalarRegister();
+            return addr_in_reg ? 2 : 1;
+        }
+
+        // Operand queries: index 0 refers to the source, any other index
+        // to the address.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return (operandIndex != 0) ? this->addr.isVectorRegister()
+                                       : src.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return (operandIndex != 0) ? this->addr.isCondRegister()
+                                       : src.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return (operandIndex != 0) ? this->addr.isScalarRegister()
+                                       : src.isScalarRegister();
+        }
+
+        // every operand of a store is a source, none is a destination
+        bool isSrcOperand(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+
+        int getOperandSize(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return operandIndex ? this->addr.opSize() : src.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert(0 <= operandIndex && operandIndex < getNumOperands());
+            return operandIndex ? this->addr.regIndex() : src.regIndex();
+        }
+    };
+
+
+    /**
+     * HSAIL store. MemDataType selects the element type written to
+     * memory, SrcDataType the source register type and AddrOperandType the
+     * form of the address. Operand-list sources (at most four) live in
+     * src_vect; a lone source uses the src member of StInstBase.
+     */
+    template<typename MemDataType, typename SrcDataType,
+             typename AddrOperandType>
+    class StInst :
+        public StInstBase<MemDataType, typename SrcDataType::OperandType,
+                          AddrOperandType>,
+        public MemInst
+    {
+      public:
+        typename SrcDataType::OperandType::SrcOperand src_vect[4];
+        uint16_t num_src_operands;
+        void generateDisassembly();
+
+        /**
+         * Decode the source operand at index srcIdx. An operand list
+         * yields as many sources as the unsigned value at the start of its
+         * data divided by 4; any other operand kind counts as one source.
+         */
+        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                        const char *_opcode, int srcIdx)
+            : StInstBase<MemDataType, typename SrcDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(SrcDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            BrigRegOperandInfo rinfo;
+            unsigned src_offs = obj->getOperandPtr(ib->operands, srcIdx);
+            const Brig::BrigOperand *src_op = obj->getOperand(src_offs);
+            const bool is_list =
+                (src_op->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+            // rinfo is populated here but not read again in this constructor
+            if (src_op->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                rinfo = findRegDataType(src_offs, obj);
+            } else {
+                const Brig::BrigOperandConstantBytes *bytes_op =
+                    (Brig::BrigOperandConstantBytes*)src_op;
+                rinfo = BrigRegOperandInfo(
+                    (Brig::BrigKind16_t)bytes_op->base.kind,
+                    Brig::BRIG_TYPE_NONE);
+            }
+
+            num_src_operands = 1;
+            if (is_list) {
+                const Brig::BrigOperandOperandList *reg_list =
+                    (const Brig::BrigOperandOperandList*)src_op;
+                num_src_operands =
+                    *((unsigned*)obj->getData(reg_list->elements)) / 4;
+                assert(num_src_operands <= 4);
+            }
+
+            if (num_src_operands > 1) {
+                assert(is_list);
+                for (int idx = 0; idx < num_src_operands; ++idx)
+                    src_vect[idx].init_from_vect(src_offs, obj, idx);
+            }
+        }
+
+        /**
+         * Begin the store. A non-local store with release semantics (and
+         * separate acquire/release enabled) first injects a RELEASE
+         * request and leaves the writes to the execSt() continuation;
+         * every other store performs its writes immediately.
+         */
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            const bool release_first = !isLocalMem() &&
+                gpuDynInst->computeUnit()->shader->separate_acquire_release &&
+                gpuDynInst->memoryOrder == Enums::MEMORY_ORDER_SC_RELEASE;
+
+            if (!release_first) {
+                // if there is no release semantic, perform stores immediately
+                execSt(gpuDynInst);
+                return;
+            }
+
+            gpuDynInst->statusBitVector = VectorMask(1);
+            gpuDynInst->execContinuation = &GPUStaticInst::execSt;
+            gpuDynInst->useContinuation = true;
+            // create request
+            Request *fence_req =
+                new Request(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(),
+                            0, gpuDynInst->wfDynId, -1);
+            fence_req->setFlags(Request::RELEASE);
+            gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false,
+                                                            fence_req);
+        }
+
+        // stores to the group segment are the local-memory ones
+        bool
+        isLocalMem() const override
+        {
+            return Brig::BRIG_SEGMENT_GROUP == this->segment;
+        }
+
+      private:
+        /**
+         * Perform the per-lane writes; may be called through a
+         * continuation if the store had release semantics (see the comment
+         * for execSt in gpu_static_inst.hh). For source k, each active lane
+         * i writes d_data[k * VSZ + i] to addr[i] + k * sizeof(element).
+         */
+        void
+        execSt(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType ElemT;
+
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            // operand-list stores record a per-lane source count
+            if (num_src_operands > 1) {
+                for (int lane = 0; lane < VSZ; ++lane)
+                    if (!gpuDynInst->exec_mask[lane])
+                        gpuDynInst->statusVector.push_back(0);
+                    else
+                        gpuDynInst->statusVector.push_back(num_src_operands);
+            }
+
+            for (int k = 0; k < num_src_operands; ++k) {
+                ElemT *slot = (ElemT*)gpuDynInst->d_data + k * VSZ;
+
+                for (int lane = 0; lane < VSZ; ++lane, ++slot) {
+                    if (!gpuDynInst->exec_mask[lane])
+                        continue;
+
+                    Addr vaddr = gpuDynInst->addr[lane] + k * sizeof(ElemT);
+
+                    if (isLocalMem()) {
+                        //store to shared memory
+                        gpuDynInst->wavefront()->ldsChunk->write<ElemT>(vaddr,
+                                                                        *slot);
+                        continue;
+                    }
+
+                    Request *req =
+                        new Request(0, vaddr, sizeof(ElemT), 0,
+                                    gpuDynInst->computeUnit()->masterId(),
+                                    0, gpuDynInst->wfDynId, lane);
+                    gpuDynInst->setRequestFlags(req);
+                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+                    pkt->dataStatic<ElemT>(slot);
+
+                    // translation is performed in sendRequest(); the
+                    // request will be finished when the store completes
+                    gpuDynInst->useContinuation = false;
+                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                           lane, pkt);
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      public:
+        // Operand queries. Sources are numbered first; the address, when
+        // counted by getNumOperands(), has index num_src_operands.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex == num_src_operands)
+                return this->addr.isVectorRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isVectorRegister();
+            return num_src_operands == 1 && this->src.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex == num_src_operands)
+                return this->addr.isCondRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isCondRegister();
+            return num_src_operands == 1 && this->src.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex == num_src_operands)
+                return this->addr.isScalarRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isScalarRegister();
+            return num_src_operands == 1 && this->src.isScalarRegister();
+        }
+        // every operand of a store is a source, none is a destination
+        bool isSrcOperand(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex == num_src_operands)
+                return this->addr.opSize();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].opSize();
+            return (num_src_operands == 1) ? this->src.opSize() : 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (operandIndex == num_src_operands)
+                return this->addr.regIndex();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].regIndex();
+            if (num_src_operands == 1)
+                return this->src.regIndex();
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return num_src_operands + 1;
+            return num_src_operands;
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode a store: source is operand 0 and address operand 1, swapped for
+    // atomic / atomicnoret opcodes; the address kind picks the StInst variant.
+    template<typename DataType, typename SrcDataType>
+    GPUStaticInst*
+    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const bool atomic_form =
+            ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
+            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET;
+        const int srcIdx = atomic_form ? 1 : 0;
+        const int destIdx = atomic_form ? 0 : 1;
+        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);
+        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new StInst<DataType, SrcDataType,
+                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
+        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (tmp.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new StInst<DataType, SrcDataType,
+                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new StInst<DataType, SrcDataType,
+                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
+              default:
+                // report regKind, the value this switch dispatches on
+                fatal("Bad st register operand type %d\n", tmp.regKind);
+            }
+        } else {
+            fatal("Bad st register operand kind %d\n", tmp.kind);
+        }
+    }
+    // Maps a BRIG opcode and atomic operation to a MemOpType; declaration only.
+    Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
+                                           Brig::BrigAtomicOperation brigOp);
+
+    /**
+     * State and operand bookkeeping shared by the HSAIL atomic instructions.
+     *
+     * The operand-query methods number the operands as: sources first
+     * ([0, NumSrcOperands)), then the address (NumSrcOperands), then the
+     * destination (any larger index).
+     */
+    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>
+    class AtomicInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename OperandType::DestOperand dest;
+        typename OperandType::SrcOperand src[NumSrcOperands];
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigAtomicOperation atomicOperation;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigOpcode opcode;
+        Enums::MemOpType opType;
+
+        // Copies the atomic attributes out of the BRIG instruction, derives
+        // o_type from the segment (anything but global/group/flat panics)
+        // and binds the operands. BRIG lists the destination first when
+        // HasDst is set, then the address, then the sources; dest is not
+        // initialized when HasDst is false.
+        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *atomic_inst = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)atomic_inst->segment;
+            memoryScope = (BrigMemoryScope)atomic_inst->memoryScope;
+            memoryOrder = (BrigMemoryOrder)atomic_inst->memoryOrder;
+            atomicOperation =
+                (BrigAtomicOperation)atomic_inst->atomicOperation;
+            opcode = (BrigOpcode)ib->opcode;
+            opType = brigAtomicToMemOpType(opcode, atomicOperation);
+
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_ATOMIC;
+                break;
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_ATOMIC;
+                break;
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_ATOMIC;
+                break;
+              default:
+                panic("Atomic: segment %d not supported\n", segment);
+            }
+
+            // slot of the address operand in the BRIG operand list
+            const int addr_slot = HasDst ? 1 : 0;
+
+            if (HasDst) {
+                unsigned dest_offs = obj->getOperandPtr(ib->operands, 0);
+                dest.init(dest_offs, obj);
+            }
+
+            unsigned addr_offs = obj->getOperandPtr(ib->operands, addr_slot);
+            addr.init(addr_offs, obj);
+
+            for (int s = 0; s < NumSrcOperands; ++s) {
+                unsigned src_offs =
+                    obj->getOperandPtr(ib->operands, addr_slot + 1 + s);
+                src[s].init(src_offs, obj);
+            }
+        }
+
+        // Counts the vector-register operands that are read: every such
+        // source, plus the address when it is held in a vector register.
+        int numSrcRegOperands()
+        {
+            int count = 0;
+            for (int s = 0; s < NumSrcOperands; ++s) {
+                if (src[s].isVectorRegister())
+                    ++count;
+            }
+            if (addr.isVectorRegister())
+                ++count;
+            return count;
+        }
+        // NOTE(review): dest is queried here (and below) even when HasDst is
+        // false and it was never initialized - confirm that is intended.
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        // Reports NumSrcOperands + 2 slots when the address is a vector
+        // register and NumSrcOperands + 1 otherwise.
+        int getNumOperands()
+        {
+            return NumSrcOperands + (addr.isVectorRegister() ? 2 : 1);
+        }
+        // Register-class queries; each dispatches on operandIndex using the
+        // source / address / destination numbering described above.
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isVectorRegister();
+            if (operandIndex == NumSrcOperands)
+                return addr.isVectorRegister();
+            return dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isCondRegister();
+            if (operandIndex == NumSrcOperands)
+                return addr.isCondRegister();
+            return dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isScalarRegister();
+            if (operandIndex == NumSrcOperands)
+                return addr.isScalarRegister();
+            return dest.isScalarRegister();
+        }
+        // The address counts as a source only when it is a vector register.
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex > NumSrcOperands)
+                return false;
+            return operandIndex < NumSrcOperands || addr.isVectorRegister();
+        }
+        // Unlike its siblings, this query does not range-check operandIndex.
+        bool isDstOperand(int operandIndex)
+        {
+            return operandIndex > NumSrcOperands;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].opSize();
+            if (operandIndex == NumSrcOperands)
+                return addr.opSize();
+            return dest.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].regIndex();
+            if (operandIndex == NumSrcOperands)
+                return addr.regIndex();
+            return dest.regIndex();
+        }
+    };
+
+    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
+             bool HasDst>  // true for "atomic", false for "atomicnoret"
+    class AtomicInst :  // atomic read-modify-write and its execution logic
+        public AtomicInstBase<typename MemDataType::OperandType,
+                              AddrOperandType, NumSrcOperands, HasDst>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
+                             NumSrcOperands, HasDst>
+                (ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // Entry point for the access. If this atomic carries release
+            // semantics, a release fence is issued before the RMW itself.
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && (gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    // resume in execAtomic() once the release completes
+                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
+                    gpuDynInst->useContinuation = true;
+
+                    // create the fence request, flagged as a release
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // no release needed (or local memory): execute the RMW immediately
+            execAtomic(gpuDynInst);
+
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // Performs the RMW for every active lane: directly on the LDS chunk
+        // for local memory, otherwise as one SwapReq packet per lane. May be
+        // run via execContinuation (see gpu_dyn_inst.hh) after a release.
+        void
+        execAtomic(GPUDynInstPtr gpuDynInst) override
+        {
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            typedef typename MemDataType::CType c0;
+            // d: value read back; e: operand (CAS compare); f: CAS new value
+            c0 *d = &((c0*) gpuDynInst->d_data)[0];
+            c0 *e = &((c0*) gpuDynInst->a_data)[0];
+            c0 *f = &((c0*) gpuDynInst->x_data)[0];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (gpuDynInst->exec_mask[i]) {
+                    Addr vaddr = gpuDynInst->addr[i];
+
+                    if (isLocalMem()) {
+                        Wavefront *wavefront = gpuDynInst->wavefront();
+                        *d = wavefront->ldsChunk->read<c0>(vaddr);
+
+                        switch (this->opType) {
+                          case Enums::MO_AADD:
+                          case Enums::MO_ANRADD:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
+                            break;
+                          case Enums::MO_ASUB:
+                          case Enums::MO_ANRSUB:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
+                            break;
+                          case Enums::MO_AMAX:
+                          case Enums::MO_ANRMAX:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AMIN:
+                          case Enums::MO_ANRMIN:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AAND:
+                          case Enums::MO_ANRAND:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
+                            break;
+                          case Enums::MO_AOR:
+                          case Enums::MO_ANROR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
+                            break;
+                          case Enums::MO_AXOR:
+                          case Enums::MO_ANRXOR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
+                            break;
+                          case Enums::MO_AINC:
+                          case Enums::MO_ANRINC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
+                            break;
+                          case Enums::MO_ADEC:
+                          case Enums::MO_ANRDEC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
+                            break;
+                          case Enums::MO_AEXCH:
+                          case Enums::MO_ANREXCH:
+                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
+                            break;
+                          case Enums::MO_ACAS:
+                          case Enums::MO_ANRCAS:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
+                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
+                            break;
+                          default:
+                            fatal("Unrecognized or invalid HSAIL atomic op "
+                                  "type.\n");
+                            break;
+                        }
+                    } else {
+                        Request *req =
+                            new Request(0, vaddr, sizeof(c0), 0,
+                                        gpuDynInst->computeUnit()->masterId(),
+                                        0, gpuDynInst->wfDynId, i,
+                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
+                                        f, this->opType));
+
+                        gpuDynInst->setRequestFlags(req);
+                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+                        pkt->dataStatic(d);
+
+                        if (gpuDynInst->computeUnit()->shader->
+                            separate_acquire_release &&
+                            (gpuDynInst->memoryOrder ==
+                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
+                            // acquire semantics: continue in execAtomicAcq
+                            // once the RMW completes. NOTE(review): only
+                            // SC_ACQUIRE is tested here - confirm acq_rel.
+                            gpuDynInst->execContinuation =
+                                &GPUStaticInst::execAtomicAcq;
+
+                            gpuDynInst->useContinuation = true;
+                        } else {
+                            // the request will be finished when the RMW completes
+                            gpuDynInst->useContinuation = false;
+                        }
+                        // translation is performed in sendRequest()
+                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
+                                                               pkt);
+                    }
+                }
+
+                ++d;
+                ++e;
+                ++f;
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+        // execAtomicAcq is only ever reached through a continuation
+        // (see the comment for execContinuation in gpu_dyn_inst.hh).
+        void
+        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after performing the RMW, check to see if this instruction
+            // has acquire semantics, and if so, issue an acquire
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                     && gpuDynInst->memoryOrder ==
+                     Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the request will be finished when
+                    // the acquire completes
+                    gpuDynInst->useContinuation = false;
+                    // create the fence request, flagged as an acquire
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+    };
+
+    // Builds the static instruction for a BRIG atomic. Atomic ld/st are
+    // handed to the plain ld/st decoders; any other operation becomes an
+    // AtomicInst, with a destination unless the opcode is atomicnoret.
+    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
+    GPUStaticInst*
+    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
+            return decodeLd<DataType>(ib, obj);
+        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
+            // decodeSt's first argument is the type StInst::execute uses to
+            // size the stored data, so it has to match the store width
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:  return decodeSt<S8,S8>(ib, obj);
+              case Brig::BRIG_TYPE_B16: return decodeSt<S16,S16>(ib, obj);
+              case Brig::BRIG_TYPE_B32: return decodeSt<S32,S32>(ib, obj);
+              case Brig::BRIG_TYPE_B64: return decodeSt<S64,S64>(ib, obj);
+              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
+            }
+        } else if ((Brig::BrigOpcode)ib->opcode ==
+                   Brig::BRIG_OPCODE_ATOMICNORET) {
+            return new AtomicInst<DataType, AddrOperandType,
+                NumSrcOperands, false>(ib, obj, "atomicnoret");
+        } else {
+            return new AtomicInst<DataType, AddrOperandType,
+                NumSrcOperands, true>(ib, obj, "atomic");
+        }
+    }
+
+    // Picks the address-operand flavour of an atomic from the kind of its
+    // address operand and forwards to constructAtomic; bad kinds are fatal.
+    template<typename DataType, int NumSrcOperands>
+    GPUStaticInst*
+    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // the address is operand 0 for atomicnoret and operand 1 otherwise
+        unsigned addr_pos = 1;
+        if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
+            addr_pos = 0;
+        const unsigned addr_offs = obj->getOperandPtr(ib->operands, addr_pos);
+        BrigRegOperandInfo addr_info = findRegDataType(addr_offs, obj);
+
+        if (addr_info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return constructAtomic<DataType, NoRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+        }
+        if (addr_info.kind != Brig::BRIG_KIND_OPERAND_REGISTER)
+            fatal("Bad atomic register operand kind %d\n", addr_info.kind);
+        switch (addr_info.regKind) {  // V2/V4 not allowed
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            return constructAtomic<DataType, SRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            return constructAtomic<DataType, DRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+          default:
+            fatal("Bad atomic register operand type %d\n", addr_info.type);
+        }
+    }
+
+
+    // Decodes an atomic instruction: compare-and-swap carries two source
+    // operands, every other atomic operation carries one.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const bool is_cas = ((const Brig::BrigInstAtomic*)ib)->atomicOperation
+            == Brig::BRIG_ATOMIC_CAS;
+
+        return is_cas ? decodeAtomicHelper<DataType, 2>(ib, obj) :
+                        decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+
+    // Decodes an atomicnoret instruction; as with decodeAtomic, only
+    // compare-and-swap takes two source operands.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *atomic = (const Brig::BrigInstAtomic*)ib;
+        if (atomic->atomicOperation != Brig::BRIG_ATOMIC_CAS)
+            return decodeAtomicHelper<DataType, 1>(ib, obj);
+        return decodeAtomicHelper<DataType, 2>(ib, obj);
+    }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_MEM_HH__
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/generic_types.hh"
+#include "gpu-compute/hsail_code.hh"
+
+// defined in code.cc, but not worth sucking in all of code.h for this
+// at this point
+extern const char *segmentNames[];
+
+namespace HsailISA
+{
+    // Formats "<opcode>_<type> <dest>,<addr>" into this->disassembly.
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        const auto addr_text = this->addr.disassemble();
+        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
+            DestDataType::label, this->dest.disassemble(), addr_text);
+    }
+
+    // Stores the address computed for each lane into the destination
+    // operand; lanes not set in the mask from get_pred() are skipped.
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        typedef typename DestDataType::CType CType M5_VAR_USED;
+        Wavefront *wf = gpuDynInst->wavefront();
+        const VectorMask &active = wf->get_pred();
+
+        uint64_t lane_addrs[VSZ];
+        this->addr.calcVector(wf, lane_addrs);
+        for (int l = 0; l < VSZ; ++l) {
+            if (active[l])
+                this->dest.set(wf, l, lane_addrs[l]);
+        }
+    }
+
+    // Renders the load as "<opcode>_<segment>_<type> <dest>,<addr>"; loads
+    // with 2 or 4 destinations print them as "(<d0>,<d1>,...), <addr>".
+    // Any other destination count is fatal.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        if (num_dest_operands != 1 && num_dest_operands != 2 &&
+            num_dest_operands != 4) {
+            fatal("Bad ld register dest operand, num vector operands: %d \n",
+                  num_dest_operands);
+        }
+
+        const char *seg_name = segmentNames[this->segment];
+        const auto addr_text = this->addr.disassemble();
+
+        if (num_dest_operands == 1) {
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         seg_name, MemDataType::label,
+                                         this->dest.disassemble(), addr_text);
+        } else if (num_dest_operands == 2) {
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         seg_name, MemDataType::label,
+                                         this->dest_vect[0].disassemble(),
+                                         this->dest_vect[1].disassemble(),
+                                         addr_text);
+        } else {
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode, seg_name,
+                                         MemDataType::label,
+                                         this->dest_vect[0].disassemble(),
+                                         this->dest_vect[1].disassemble(),
+                                         this->dest_vect[2].disassemble(),
+                                         this->dest_vect[3].disassemble(),
+                                         addr_text);
+        }
+    }
+
+    // Translates a private-segment offset into the address used for one
+    // lane of wavefront w.
+    //
+    // Ideally the size of the accessed object would be known, but the
+    // compiler does not generate enough information for that yet, so the
+    // private work-item spaces are simply lined up for now. The
+    // symbol-based computation is kept here for reference:
+    //
+    //   StorageElement* se =
+    //       i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
+    //   assert(se);
+    //   return w->wfSlotId * w->privSizePerItem * VSZ +
+    //       se->offset * VSZ + lane * se->size;
+    //
+    // Current strategy: interleave the private spaces of the work-items in
+    // a wavefront at 8-byte granularity. This does not coalesce as well as
+    // the spill-space scheme, but that scheme cannot be used here because
+    // the same private address may be accessed by loads/stores of
+    // different sizes. It assumes the largest private access is 8 bytes;
+    // anything larger would require a bigger stride. The parameter i is
+    // currently unused.
+    static Addr
+    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
+    {
+        const Addr chunk_idx = addr / 8;
+        const Addr chunk_off = addr % 8;
+
+        // start of the chunk row, then this lane's 8-byte slot within it
+        const Addr priv_addr =
+            chunk_idx * 8 * VSZ + lane * 8 + chunk_off + w->privBase;
+
+        // the result must stay inside this wavefront's private region
+        assert(priv_addr < w->privBase + (w->privSizePerItem * VSZ));
+
+        return priv_addr;
+    }
+
+    // Executes a load. Kernarg and arg loads are serviced right here without
+    // a memory request; all other segments fill in the dynamic instruction,
+    // remap per-lane addresses and queue it on the global or local pipe.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename MemDataType::CType MemCType;
+        const VectorMask &mask = w->get_pred();
+
+        // Kernarg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which point we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
+            MemCType val;
+            // I assume no vector ld for kernargs
+            assert(num_dest_operands == 1);
+
+            // assuming for the moment that we'll never do register
+            // offsets into kernarg space... just to make life simpler
+            uint64_t address = this->addr.calcUniform();
+
+            val = *(MemCType*)&w->kernelArgs[address];
+            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    MemCType val = w->readCallArgMem<MemCType>(lane, address);
+
+                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
+                            (unsigned long long)val);
+                    this->dest.set(w, lane, val);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = MemDataType::memType;
+        m->v_type = DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        if (num_dest_operands == 1) {
+            m->dst_reg = this->dest.regIndex();
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_dest_operands;
+            for (int i = 0; i < num_dest_operands; ++i) {
+                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
+            }
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            //  addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (m->addr[lane] < w->privSizePerItem) {
+                    if (mask[lane]) {
+                        // what is the size of the object we are accessing?
+                        // find base for this wavefront
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_dest_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    //  note: this calculation will NOT WORK if the compiler
+                    //  ever generates loads/stores to the same address with
+                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(MemCType) + w->spillBase;
+
+                        w->last_addr[lane] = m->addr[lane];
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_READONLY:
+            m->s_type = SEG_READONLY;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
+                    m->addr[lane] += w->roBase;
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] +
+                            lane * sizeof(MemCType) + w->privBase;
+                    }
+                }
+            }
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Load to unsupported segment %d %llx\n", this->segment,
+                  m->addr[0]);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Execute a store instruction on behalf of one wavefront.
+    //
+    // Stores to the arg segment are applied immediately, lane by lane,
+    // through Wavefront::writeCallArgMem() and never enter a memory
+    // pipeline. For every other supported segment the dynamic instruction
+    // is filled in (per-lane addresses, store data, op/type/ordering
+    // metadata) and pushed onto the global or local memory request FIFO of
+    // the wavefront's compute unit; the wavefront's request counters are
+    // updated to match. An unsupported segment is a fatal error.
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        typedef typename OperationType::CType CType;
+
+        Wavefront *wf = gpuDynInst->wavefront();
+        const VectorMask &active = wf->get_pred();
+
+        // arg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front.  Someday we should
+        // make this more realistic, at which point we should get rid of
+        // this block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (!active[lane])
+                    continue;
+
+                CType data = this->src.template get<CType>(wf, lane);
+                DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
+                wf->writeCallArgMem<CType>(lane, address, data);
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr req = gpuDynInst;
+
+        req->exec_mask = wf->execMask();
+
+        this->addr.calcVector(wf, req->addr);
+
+        // Gather the store data of the active lanes. With several source
+        // operands, operand k occupies slots [k * VSZ, (k + 1) * VSZ).
+        CType *store_data = (CType*)req->d_data;
+
+        if (num_src_operands == 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (!active[lane])
+                    continue;
+
+                store_data[lane] = this->src.template get<CType>(wf, lane);
+            }
+        } else {
+            for (int k = 0; k < num_src_operands; ++k) {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (!active[lane])
+                        continue;
+
+                    store_data[k * VSZ + lane] =
+                        this->src_vect[k].template get<CType>(wf, lane);
+                }
+            }
+        }
+
+        req->m_op = Enums::MO_ST;
+        req->m_type = OperationType::memType;
+        req->v_type = OperationType::vgprType;
+
+        req->statusBitVector = 0;
+        req->equiv = this->equivClass;
+
+        // one register per source operand (1 in the single-operand case)
+        req->n_reg = num_src_operands;
+
+        req->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+        req->scope = getGenericMemoryScope(this->memoryScope);
+
+        req->simdId = wf->simdId;
+        req->wfSlotId = wf->wfSlotId;
+        req->wfDynId = wf->wfDynId;
+        req->kern_id = wf->kern_id;
+        req->cu_id = wf->computeUnit->cu_id;
+        req->latency.init(&wf->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            req->s_type = SEG_GLOBAL;
+            req->pipeId = GLBMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            //  addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (!active[lane] ||
+                    req->addr[lane] >= wf->privSizePerItem) {
+                    continue;
+                }
+
+                // calcPrivAddr will fail if accesses are unaligned
+                assert((req->addr[lane] & (sizeof(CType) - 1)) == 0);
+
+                req->addr[lane] = calcPrivAddr(req->addr[lane], wf, lane,
+                                               this);
+            }
+
+            wf->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+            wf->outstanding_reqs_wr_gm++;
+            wf->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_src_operands == 1);
+            req->s_type = SEG_SPILL;
+            req->pipeId = GLBMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(1));
+
+            // turn each lane's spill offset into an address inside the
+            // spill region that starts at spillBase
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (!active[lane])
+                    continue;
+
+                assert(req->addr[lane] < wf->spillSizePerItem);
+
+                req->addr[lane] = req->addr[lane] * wf->spillWidth +
+                                  lane * sizeof(CType) + wf->spillBase;
+            }
+
+            wf->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+            wf->outstanding_reqs_wr_gm++;
+            wf->wr_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            req->s_type = SEG_SHARED;
+            req->pipeId = LDSMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(24));
+            wf->computeUnit->localMemoryPipe.getLMReqFIFO().push(req);
+            wf->outstanding_reqs_wr_lm++;
+            wf->wr_lm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_PRIVATE:
+            req->s_type = SEG_PRIVATE;
+            req->pipeId = GLBMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(1));
+
+            // turn each lane's private offset into an address inside the
+            // private region that starts at privBase
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (!active[lane])
+                    continue;
+
+                assert(req->addr[lane] < wf->privSizePerItem);
+
+                req->addr[lane] = req->addr[lane] +
+                                  lane * sizeof(CType) + wf->privBase;
+            }
+
+            wf->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+            wf->outstanding_reqs_wr_gm++;
+            wf->wr_gm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Store to unsupported segment %d\n", this->segment);
+        }
+
+        // the request has left the issue stage and is now outstanding
+        wf->outstanding_reqs++;
+        wf->mem_reqs_in_pipe--;
+    }
+
+    // Build the disassembly text of a store instruction.
+    //
+    // The text has the form "<opcode>_<segment>_<type> <sources>,<address>".
+    // A single source operand is printed bare; two or four source operands
+    // are printed as a parenthesised vector. Any other operand count is a
+    // fatal error.
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::generateDisassembly()
+    {
+        switch (num_src_operands) {
+          case 1:
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src.disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 2:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 4:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->src_vect[2].disassemble(),
+                                         this->src_vect[3].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          default:
+            // this is the store instruction, so report it as "st"
+            fatal("Bad st register src operand, num vector operands: "
+                  "%d \n", num_src_operands);
+            break;
+        }
+    }
+
+    // Execute an atomic instruction on behalf of one wavefront.
+    //
+    // The per-lane addresses and source operand(s) are copied into the
+    // dynamic instruction, its metadata is filled in, and it is pushed onto
+    // the global or local memory request FIFO of the wavefront's compute
+    // unit. Both the read and the write request counters of the chosen
+    // pipeline are updated. Only the global and group segments are
+    // supported; anything else is a fatal error.
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+        HasDst>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        typedef typename DataType::CType CType;
+
+        Wavefront *wf = gpuDynInst->wavefront();
+        GPUDynInstPtr req = gpuDynInst;
+
+        this->addr.calcVector(wf, req->addr);
+
+        // first source operand, read for every lane
+        CType *first_operand = (CType *)req->a_data;
+        for (int lane = 0; lane < VSZ; ++lane) {
+            first_operand[lane] = this->src[0].template get<CType>(wf, lane);
+        }
+
+        // load second source operand for CAS
+        if (NumSrcOperands > 1) {
+            CType *second_operand = (CType*)req->x_data;
+            for (int lane = 0; lane < VSZ; ++lane) {
+                second_operand[lane] =
+                    this->src[1].template get<CType>(wf, lane);
+            }
+        }
+
+        assert(NumSrcOperands <= 2);
+
+        req->m_op = this->opType;
+        req->m_type = DataType::memType;
+        req->v_type = DataType::vgprType;
+
+        req->exec_mask = wf->execMask();
+        req->statusBitVector = 0;
+        req->equiv = 0;  // atomics don't have an equivalence class operand
+        req->n_reg = 1;
+        req->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+        req->scope = getGenericMemoryScope(this->memoryScope);
+
+        if (HasDst) {
+            req->dst_reg = this->dest.regIndex();
+        }
+
+        req->simdId = wf->simdId;
+        req->wfSlotId = wf->wfSlotId;
+        req->wfDynId = wf->wfDynId;
+        req->kern_id = wf->kern_id;
+        req->cu_id = wf->computeUnit->cu_id;
+        req->latency.init(&wf->computeUnit->shader->tick_cnt);
+
+        switch (this->segment) {
+          case Brig::BRIG_SEGMENT_GLOBAL:
+            req->s_type = SEG_GLOBAL;
+            req->pipeId = GLBMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(64));
+
+            wf->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+
+            // an atomic counts as both a write and a read request
+            wf->outstanding_reqs_wr_gm++;
+            wf->wr_gm_reqs_in_pipe--;
+            wf->outstanding_reqs_rd_gm++;
+            wf->rd_gm_reqs_in_pipe--;
+            break;
+
+          case Brig::BRIG_SEGMENT_GROUP:
+            req->s_type = SEG_SHARED;
+            req->pipeId = LDSMEM_PIPE;
+            req->latency.set(wf->computeUnit->shader->ticks(24));
+
+            wf->computeUnit->localMemoryPipe.getLMReqFIFO().push(req);
+
+            // an atomic counts as both a write and a read request
+            wf->outstanding_reqs_wr_lm++;
+            wf->wr_lm_reqs_in_pipe--;
+            wf->outstanding_reqs_rd_lm++;
+            wf->rd_lm_reqs_in_pipe--;
+            break;
+
+          default:
+            fatal("Atomic op to unsupported segment %d\n",
+                  this->segment);
+        }
+
+        wf->outstanding_reqs++;
+        wf->mem_reqs_in_pipe--;
+    }
+
+    // Forward declaration (the definition is not in this file): yields a
+    // C string for a BRIG atomic operation. It is used below as the
+    // operation component of an atomic instruction's disassembly text.
+    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
+
+    // Build the disassembly text of an atomic instruction:
+    // "<opcode>_<atomic op>_<segment>_<type> [<dest>,]<address>" followed by
+    // ",<source>" for each source operand. The destination is printed only
+    // when the instruction has one (HasDst).
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+             bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+               HasDst>::generateDisassembly()
+    {
+        if (!HasDst) {
+            this->disassembly =
+                csprintf("%s_%s_%s_%s %s", this->opcode,
+                         atomicOpToString(this->atomicOperation),
+                         segmentNames[this->segment],
+                         DataType::label, this->addr.disassemble());
+        } else {
+            this->disassembly =
+                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
+                         atomicOpToString(this->atomicOperation),
+                         segmentNames[this->segment],
+                         DataType::label, this->dest.disassemble(),
+                         this->addr.disassemble());
+        }
+
+        // append the source operands, each preceded by a comma
+        for (int operand = 0; operand < NumSrcOperands; ++operand) {
+            this->disassembly += ",";
+            this->disassembly += this->src[operand].disassemble();
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+    // Pseudo (or magic) instructions are overloaded on the hsail call
+    // instruction, because of its flexible parameter signature.
+
+    // To add a new magic instruction:
+    // 1. Add an entry to the enum.
+    // 2. Implement it in the switch statement below
+    //    (Call::execPseudoInst).
+    // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+    //    so it's easy to call from an OpenCL kernel.
+
+    // This enum should be identical to the enum in
+    // hsa/hsail-gpu-compute/util/magicinst.h
+    // Opcodes of the magic instructions, as read from operand 0 of the
+    // call. The numeric values are spelled out because they have to stay
+    // in step with the copy of this enum used by the kernels.
+    enum
+    {
+        MAGIC_PRINT_WF_32                  = 0,
+        MAGIC_PRINT_WF_64                  = 1,
+        MAGIC_PRINT_LANE                   = 2,
+        MAGIC_PRINT_LANE_64                = 3,
+        MAGIC_PRINT_WF_FLOAT               = 4,
+        MAGIC_SIM_BREAK                    = 5,
+        MAGIC_PREF_SUM                     = 6,
+        MAGIC_REDUCTION                    = 7,
+        MAGIC_MASKLANE_LOWER               = 8,
+        MAGIC_MASKLANE_UPPER               = 9,
+        MAGIC_JOIN_WF_BAR                  = 10,
+        MAGIC_WAIT_WF_BAR                  = 11,
+        MAGIC_PANIC                        = 12,
+        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG = 13,
+        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG  = 14,
+        MAGIC_LOAD_GLOBAL_U32_REG          = 15,
+        MAGIC_XACT_CAS_LD                  = 16,
+        MAGIC_MOST_SIG_THD                 = 17,
+        MAGIC_MOST_SIG_BROADCAST           = 18,
+        MAGIC_PRINT_WFID_32                = 19,
+        MAGIC_PRINT_WFID_64                = 20
+    };
+
+    // Decode and run the magic instruction carried by this call.
+    //
+    // The opcode is operand 0 of every active lane; all active lanes must
+    // agree on it, otherwise the simulation is aborted. If no lane is
+    // active the opcode stays at its initial value of 0
+    // (MAGIC_PRINT_WF_32). An unknown opcode is a fatal error.
+    void
+    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        const VectorMask &active = w->get_pred();
+
+        int op = 0;
+        bool have_op = false;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int lane_op = src1.get<int>(w, lane, 0);
+
+            if (!have_op) {
+                // the first active lane decides the opcode
+                op = lane_op;
+                have_op = true;
+            } else if (lane_op != op) {
+                fatal("Multiple magic instructions per PC not "
+                      "supported\n");
+            }
+        }
+
+        switch (op) {
+          // printing and debugging
+          case MAGIC_PRINT_WF_32:
+            MagicPrintWF32(w);
+            break;
+          case MAGIC_PRINT_WF_64:
+            MagicPrintWF64(w);
+            break;
+          case MAGIC_PRINT_LANE:
+            MagicPrintLane(w);
+            break;
+          case MAGIC_PRINT_LANE_64:
+            MagicPrintLane64(w);
+            break;
+          case MAGIC_PRINT_WF_FLOAT:
+            MagicPrintWFFloat(w);
+            break;
+          case MAGIC_SIM_BREAK:
+            MagicSimBreak(w);
+            break;
+
+          // cross-lane operations
+          case MAGIC_PREF_SUM:
+            MagicPrefixSum(w);
+            break;
+          case MAGIC_REDUCTION:
+            MagicReduction(w);
+            break;
+          case MAGIC_MASKLANE_LOWER:
+            MagicMaskLower(w);
+            break;
+          case MAGIC_MASKLANE_UPPER:
+            MagicMaskUpper(w);
+            break;
+
+          // wavefront barrier bookkeeping
+          case MAGIC_JOIN_WF_BAR:
+            MagicJoinWFBar(w);
+            break;
+          case MAGIC_WAIT_WF_BAR:
+            MagicWaitWFBar(w);
+            break;
+
+          case MAGIC_PANIC:
+            MagicPanic(w);
+            break;
+
+          // atomic instructions
+          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+            break;
+          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_LOAD_GLOBAL_U32_REG:
+            MagicLoadGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_XACT_CAS_LD:
+            MagicXactCasLd(w);
+            break;
+
+          case MAGIC_MOST_SIG_THD:
+            MagicMostSigThread(w);
+            break;
+          case MAGIC_MOST_SIG_BROADCAST:
+            MagicMostSigBroadcast(w);
+            break;
+
+          case MAGIC_PRINT_WFID_32:
+            MagicPrintWF32ID(w);
+            break;
+          case MAGIC_PRINT_WFID_64:
+            MagicPrintWFID64(w);
+            break;
+
+          default:
+            fatal("unrecognized magic instruction: %d\n", op);
+        }
+    }
+
+    // Print one trace line per active lane showing operand 1 of that lane.
+    // A non-zero operand 2 selects hexadecimal output, zero selects
+    // decimal. Compiles to nothing unless TRACING_ON is set.
+    void
+    Call::MagicPrintLane(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &active = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int value = src1.get<int>(w, lane, 1);
+            int as_hex = src1.get<int>(w, lane, 2);
+
+            if (!as_hex) {
+                DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                         disassemble(), w->computeUnit->cu_id, w->simdId,
+                         w->wfSlotId, lane, value);
+            } else {
+                DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                         disassemble(), w->computeUnit->cu_id, w->simdId,
+                         w->wfSlotId, lane, value);
+            }
+        }
+    #endif
+    }
+
+    // 64-bit variant of the per-lane print: one trace line per active lane
+    // showing operand 1 read as int64_t. A non-zero operand 2 selects
+    // hexadecimal output, zero selects decimal. Compiles to nothing unless
+    // TRACING_ON is set.
+    void
+    Call::MagicPrintLane64(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &active = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int64_t value = src1.get<int64_t>(w, lane, 1);
+            int as_hex = src1.get<int>(w, lane, 2);
+
+            if (!as_hex) {
+                DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                         disassemble(), w->computeUnit->cu_id, w->simdId,
+                         w->wfSlotId, lane, value);
+            } else {
+                DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                         disassemble(), w->computeUnit->cu_id, w->simdId,
+                         w->wfSlotId, lane, value);
+            }
+        }
+    #endif
+    }
+
+    // Print operand 1 of the whole wavefront as a table, eight lanes per
+    // row, each row prefixed with the wavefront's dynamic id. Inactive
+    // lanes are shown as "xxxxxxxx". A non-zero operand 2 selects
+    // hexadecimal output for that lane, zero selects decimal. Compiles to
+    // nothing unless TRACING_ON is set.
+    void
+    Call::MagicPrintWF32(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // first lane of a row: emit the row prefix
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += "xxxxxxxx";
+            }
+
+            // last lane of a row ends the line, the others add a separator
+            res_str += ((lane & 7) == 7) ? "\n" : " ";
+        }
+
+        res_str += "\n\n";
+
+        // The text embeds the disassembly, so it must be printed as data
+        // and never be interpreted as a format string.
+        DPRINTFN("%s", res_str.c_str());
+    #endif
+    }
+
+    // Same table as MagicPrintWF32 (operand 1, eight lanes per row,
+    // operand 2 selecting hex or decimal), but it is only printed when the
+    // wavefront's dynamic id equals operand 3. Operand 3 is taken from the
+    // last active lane; with no active lane it stays at -1. Compiles to
+    // nothing unless TRACING_ON is set.
+    void
+    Call::MagicPrintWF32ID(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // first lane of a row: emit the row prefix
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += "xxxxxxxx";
+            }
+
+            // last lane of a row ends the line, the others add a separator
+            res_str += ((lane & 7) == 7) ? "\n" : " ";
+        }
+
+        res_str += "\n\n";
+        if (w->wfDynId == src_val3) {
+            // The text embeds the disassembly, so it must be printed as
+            // data and never be interpreted as a format string.
+            DPRINTFN("%s", res_str.c_str());
+        }
+    #endif
+    }
+
+    // Print operand 1 (read as int64_t) of the whole wavefront as a table,
+    // four lanes per row, each row prefixed with the wavefront's dynamic
+    // id. Inactive lanes are shown as sixteen 'x' characters. A non-zero
+    // operand 2 selects hexadecimal output for that lane, zero selects
+    // decimal. Compiles to nothing unless TRACING_ON is set.
+    void
+    Call::MagicPrintWF64(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // first lane of a row: emit the row prefix
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += "xxxxxxxxxxxxxxxx";
+            }
+
+            // last lane of a row ends the line, the others add a separator
+            res_str += ((lane & 3) == 3) ? "\n" : " ";
+        }
+
+        res_str += "\n\n";
+
+        // The text embeds the disassembly, so it must be printed as data
+        // and never be interpreted as a format string.
+        DPRINTFN("%s", res_str.c_str());
+    #endif
+    }
+
+    // Same table as MagicPrintWF64 (operand 1 as int64_t, four lanes per
+    // row, operand 2 selecting hex or decimal), but it is only printed
+    // when the wavefront's dynamic id equals operand 3. Operand 3 is taken
+    // from the last active lane; with no active lane it stays at -1.
+    // Compiles to nothing unless TRACING_ON is set.
+    void
+    Call::MagicPrintWFID64(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // first lane of a row: emit the row prefix
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += "xxxxxxxxxxxxxxxx";
+            }
+
+            // last lane of a row ends the line, the others add a separator
+            res_str += ((lane & 3) == 3) ? "\n" : " ";
+        }
+
+        res_str += "\n\n";
+        if (w->wfDynId == src_val3) {
+            // The text embeds the disassembly, so it must be printed as
+            // data and never be interpreted as a format string.
+            DPRINTFN("%s", res_str.c_str());
+        }
+    #endif
+    }
+
+    // Print operand 1 (read as float) of the whole wavefront as a table,
+    // eight lanes per row, each row prefixed with the wavefront's dynamic
+    // id. Inactive lanes are shown as "xxxxxxxx". Compiles to nothing
+    // unless TRACING_ON is set.
+    void
+    Call::MagicPrintWFFloat(Wavefront *w)
+    {
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // first lane of a row: emit the row prefix
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                float src_val1 = src1.get<float>(w, lane, 1);
+                res_str += csprintf("%08f", src_val1);
+            } else {
+                res_str += "xxxxxxxx";
+            }
+
+            // last lane of a row ends the line, the others add a separator
+            res_str += ((lane & 7) == 7) ? "\n" : " ";
+        }
+
+        res_str += "\n\n";
+
+        // The text embeds the disassembly, so it must be printed as data
+        // and never be interpreted as a format string.
+        DPRINTFN("%s", res_str.c_str());
+    #endif
+    }
+
+    // raises a signal that GDB will catch
+    // when done with the break, type "signal 0" in gdb to continue
+    //
+    // Before raising SIGTRAP a summary of the wavefront is printed: slot,
+    // kernel, SIMD and compute unit ids plus the execution mask, shown
+    // both bit by bit (highest lane first, one space after every group of
+    // eight lanes) and as a hexadecimal number.
+    void
+    Call::MagicSimBreak(Wavefront *w)
+    {
+        std::string res_str;
+        // print out state for this wavefront and then break
+        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+                           w->wfSlotId);
+
+        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
+        res_str += csprintf("  Phase ID: %i\n", w->simdId);
+        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
+        res_str += "  Exec mask: ";
+
+        for (int i = VSZ - 1; i >= 0; --i) {
+            res_str += w->execMask(i) ? "1" : "0";
+
+            // Lanes are walked from high to low, so a group of eight is
+            // complete once the lane whose index is a multiple of eight
+            // has been printed; the last group's space separates the bits
+            // from the hexadecimal form that follows.
+            if ((i & 7) == 0)
+                res_str += " ";
+        }
+
+        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+        res_str += "\nHelpful debugging hints:\n";
+        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
+
+        res_str += "\n\n";
+
+        // print the text as data; it must never act as a format string
+        DPRINTFN("%s", res_str.c_str());
+        fflush(stdout);
+
+        raise(SIGTRAP);
+    }
+
+    // Exclusive prefix sum over the active lanes: each active lane
+    // receives the sum of operand 1 of all lower-numbered active lanes
+    // (the first active lane receives 0).
+    void
+    Call::MagicPrefixSum(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        int running_sum = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int lane_val = src1.get<int>(w, lane, 1);
+
+            // write the sum so far, then fold in this lane's value
+            dest.set<int>(w, lane, running_sum);
+            running_sum += lane_val;
+        }
+    }
+
+    // reduction magic instruction
+    //   Sums operand 1 over all active lanes of the wavefront and writes
+    //   that sum back to the destination of every active lane.
+    void
+    Call::MagicReduction(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        int total = 0;
+
+        // first pass: accumulate the inputs of the active lanes
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            total += src1.get<int>(w, lane, 1);
+        }
+
+        // second pass: hand the total to every active lane
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            dest.set<int>(w, lane, total);
+        }
+    }
+
+    // Build a bit mask over the lower half of the wavefront: bit i is set
+    // when lane i (i < VSZ/2) is active and its operand 1 is non-zero. The
+    // mask is written to the destination of every active lane.
+    void
+    Call::MagicMaskLower(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        const int half = VSZ / 2;
+        int lane_bits = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int flag = src1.get<int>(w, lane, 1);
+
+            if (flag && lane < half) {
+                lane_bits = lane_bits | ((uint32_t)(1) << lane);
+            }
+        }
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            dest.set<int>(w, lane, lane_bits);
+        }
+    }
+
+    // Build a bit mask over the upper half of the wavefront: bit i is set
+    // when lane VSZ/2 + i is active and its operand 1 is non-zero. The
+    // mask is written to the destination of every active lane.
+    void
+    Call::MagicMaskUpper(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        const int half = VSZ / 2;
+        int lane_bits = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int flag = src1.get<int>(w, lane, 1);
+
+            if (flag && lane >= half) {
+                lane_bits = lane_bits | ((uint32_t)(1) << (lane - half));
+            }
+        }
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            dest.set<int>(w, lane, lane_bits);
+        }
+    }
+
+    // Increment the barrier count of every active lane and raise the
+    // wavefront's max_bar_cnt if any of the incremented counts exceeds it.
+    void
+    Call::MagicJoinWFBar(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        int highest = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            w->bar_cnt[lane]++;
+
+            // track the largest count among the lanes just incremented
+            if (w->bar_cnt[lane] > highest) {
+                highest = w->bar_cnt[lane];
+            }
+        }
+
+        if (w->max_bar_cnt < highest) {
+            w->max_bar_cnt = highest;
+        }
+    }
+
+    // Decrement the barrier count of every active lane, lower the
+    // wavefront's max_bar_cnt to the largest count found across all lanes
+    // (active or not) if that is smaller, then discard everything in the
+    // instruction buffer after its first entry and mark a pending fetch,
+    // if any, to be dropped.
+    void
+    Call::MagicWaitWFBar(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        int highest = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (active[lane]) {
+                w->bar_cnt[lane]--;
+            }
+
+            // the maximum is taken over every lane, not just active ones
+            if (w->bar_cnt[lane] > highest) {
+                highest = w->bar_cnt[lane];
+            }
+        }
+
+        if (w->max_bar_cnt > highest) {
+            w->max_bar_cnt = highest;
+        }
+
+        // keep only the first buffered instruction
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+
+        if (w->pendingFetch) {
+            w->dropFetch = true;
+        }
+    }
+
+    // Abort the simulation on behalf of a kernel-side assertion. panic()
+    // is invoked for the lowest-numbered active lane, reporting that
+    // lane's operand 1 as the assertion number; nothing happens if no lane
+    // is active.
+    void
+    Call::MagicPanic(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!active[lane])
+                continue;
+
+            int assertion_id = src1.get<int>(w, lane, 1);
+            panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
+                  assertion_id, lane);
+        }
+    }
+
+    // Compute the 64-bit address of every lane for the magic memory
+    // instructions and store it in m->addr[lane].
+    //
+    // Operand 1 supplies the upper 32 bits of the address and operand 2
+    // the lower 32 bits. All lanes are processed, active or not.
+    void
+    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+    {
+        for (int lane = 0; lane < VSZ; ++lane) {
+            int src_val1 = src1.get<int>(w, lane, 1);
+            int src_val2 = src1.get<int>(w, lane, 2);
+
+            // Both halves are read as signed ints. They are narrowed to
+            // uint32_t before widening so that a lower word with bit 31
+            // set is zero-extended; sign-extending it would overwrite the
+            // upper 32 bits of the address with ones.
+            Addr upper = (Addr)(uint32_t)src_val1;
+            Addr lower = (Addr)(uint32_t)src_val2;
+
+            m->addr[lane] = (upper << 32) | lower;
+        }
+
+    }
+
+    // Issue a no-return atomic add of 32-bit values to global memory.
+    //
+    // The per-lane address comes from calcAddr() and the per-lane addend
+    // from operand 3. The request is pushed onto the global memory request
+    // FIFO of the wavefront's compute unit and counted as both a write and
+    // a read request.
+    void
+    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr req = gpuDynInst;
+
+        calcAddr(w, req);
+
+        // addend of every lane, active or not
+        int *addend = (int*)req->a_data;
+        for (int lane = 0; lane < VSZ; ++lane) {
+            addend[lane] = src1.get<int>(w, lane, 3);
+        }
+
+        req->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                          Brig::BRIG_ATOMIC_ADD);
+        req->m_type = U32::memType;
+        req->v_type = U32::vgprType;
+
+        req->exec_mask = w->execMask();
+        req->statusBitVector = 0;
+        req->equiv = 0;  // atomics don't have an equivalence class operand
+        req->n_reg = 1;
+        req->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        req->scope = Enums::MEMORY_SCOPE_NONE;
+
+        req->simdId = w->simdId;
+        req->wfSlotId = w->wfSlotId;
+        req->wfDynId = w->wfDynId;
+        req->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        req->s_type = SEG_GLOBAL;
+        req->pipeId = GLBMEM_PIPE;
+        req->latency.set(w->computeUnit->shader->ticks(64));
+
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+
+        // an atomic counts as both a write and a read request
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    // Issue a no-return atomic add of 32-bit values.
+    //
+    // The per-lane address comes from calcAddr() and the per-lane addend
+    // from operand 1. The request is pushed onto the global memory request
+    // FIFO of the wavefront's compute unit and counted as both a write and
+    // a read request.
+    //
+    // NOTE(review): despite the "Group" in the name, this targets the
+    // global segment, pipeline and counters, and its addend is operand 1,
+    // which calcAddr() also uses as the upper address word - confirm
+    // whether that is intended.
+    void
+    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        GPUDynInstPtr req = gpuDynInst;
+
+        calcAddr(w, req);
+
+        // addend of every lane, active or not
+        int *addend = (int*)req->a_data;
+        for (int lane = 0; lane < VSZ; ++lane) {
+            addend[lane] = src1.get<int>(w, lane, 1);
+        }
+
+        req->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                          Brig::BRIG_ATOMIC_ADD);
+        req->m_type = U32::memType;
+        req->v_type = U32::vgprType;
+
+        req->exec_mask = w->execMask();
+        req->statusBitVector = 0;
+        req->equiv = 0;  // atomics don't have an equivalence class operand
+        req->n_reg = 1;
+        req->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        req->scope = Enums::MEMORY_SCOPE_NONE;
+
+        req->simdId = w->simdId;
+        req->wfSlotId = w->wfSlotId;
+        req->wfDynId = w->wfDynId;
+        req->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        req->s_type = SEG_GLOBAL;
+        req->pipeId = GLBMEM_PIPE;
+        req->latency.set(w->computeUnit->shader->ticks(64));
+
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(req);
+
+        // an atomic counts as both a write and a read request
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    /**
+     * Fill in the dynamic instruction as a U32 load (MO_LD) and queue it
+     * on the compute unit's global memory pipeline.  Per-lane addresses
+     * come from calcAddr().
+     *
+     * @param w wavefront issuing the request
+     * @param gpuDynInst dynamic instruction that is filled in and queued
+     */
+    void
+    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        calcAddr(w, gpuDynInst);
+
+        // operation, data types and target pipeline
+        gpuDynInst->m_op = Enums::MO_LD;
+        gpuDynInst->m_type = U32::memType;  //MemDataType::memType;
+        gpuDynInst->v_type = U32::vgprType; //DestDataType::vgprType;
+        gpuDynInst->s_type = SEG_GLOBAL;
+        gpuDynInst->pipeId = GLBMEM_PIPE;
+        gpuDynInst->n_reg = 1;
+        gpuDynInst->equiv = 0;
+
+        // FIXME
+        //gpuDynInst->dst_reg = this->dest.regIndex();
+
+        // execution state at issue time
+        gpuDynInst->exec_mask = w->execMask();
+        gpuDynInst->statusBitVector = 0;
+        gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        gpuDynInst->scope = Enums::MEMORY_SCOPE_NONE;
+
+        // identity of the issuing wavefront
+        gpuDynInst->simdId = w->simdId;
+        gpuDynInst->wfSlotId = w->wfSlotId;
+        gpuDynInst->wfDynId = w->wfDynId;
+
+        gpuDynInst->latency.init(&w->computeUnit->shader->tick_cnt);
+        gpuDynInst->latency.set(w->computeUnit->shader->ticks(1));
+
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(gpuDynInst);
+
+        // move the request from the "in pipe" to the "outstanding" counters
+        w->outstanding_reqs_rd_gm++;
+        w->outstanding_reqs++;
+        w->rd_gm_reqs_in_pipe--;
+        w->mem_reqs_in_pipe--;
+    }
+
+    /**
+     * Append this wavefront's (simdId, wfSlotId) identifier to the compute
+     * unit's xactCasLoadMap queue selected by a key value.
+     *
+     * The key is src1.get<int>(w, lane, 1) of the lowest-numbered lane set
+     * in the predicate mask, or 0 when no lane is set.
+     *
+     * @param w wavefront executing the call
+     */
+    void
+    Call::MagicXactCasLd(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+
+        // find the first active lane, if any
+        int lane = 0;
+        while (lane < VSZ && !active[lane]) {
+            ++lane;
+        }
+
+        int key = 0;
+        if (lane < VSZ) {
+            key = src1.get<int>(w, lane, 1);
+        }
+
+        auto &load_map = w->computeUnit->xactCasLoadMap;
+
+        // first use of this key: start from an empty queue
+        if (load_map.count(key) == 0) {
+            load_map[key] = ComputeUnit::waveQueue();
+            load_map[key].waveIDQueue.clear();
+        }
+
+        load_map[key].waveIDQueue
+            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+    }
+
+    /**
+     * Write 1 to dest for the highest-numbered lane set in the predicate
+     * mask and 0 for every other set lane.  Lanes that are not set are
+     * left untouched.
+     *
+     * @param w wavefront executing the call
+     */
+    void
+    Call::MagicMostSigThread(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        unsigned is_most_sig = true;
+
+        // walk from the top lane down so the first hit is the winner
+        for (int lane = VSZ; lane-- > 0; ) {
+            if (!active[lane]) {
+                continue;
+            }
+
+            dest.set<int>(w, lane, is_most_sig);
+            is_most_sig = false;
+        }
+    }
+
+    /**
+     * Copy the source value of the highest-numbered lane set in the
+     * predicate mask into dest for every set lane.  The source value is
+     * src1.get<int>(w, lane, 1); lanes that are not set are left untouched.
+     *
+     * @param w wavefront executing the call
+     */
+    void
+    Call::MagicMostSigBroadcast(Wavefront *w)
+    {
+        const VectorMask &active = w->get_pred();
+        int value = 0;
+        bool have_value = false;
+
+        // walk from the top lane down; the first hit provides the value
+        for (int lane = VSZ; lane-- > 0; ) {
+            if (!active[lane]) {
+                continue;
+            }
+
+            if (!have_value) {
+                value = src1.get<int>(w, lane, 1);
+                have_value = true;
+            }
+
+            dest.set<int>(w, lane, value);
+        }
+    }
+
+} // namespace HsailISA
diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc
new file mode 100644
index 000000000..d0e6c5541
--- /dev/null
+++ b/src/arch/hsail/operand.cc
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/operand.hh"
+
+using namespace Brig;
+
+/**
+ * Initialise this operand from the BRIG operand at opOffset.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ * @param maxRegIdx running maximum register index; raised to regIdx if
+ *                  regIdx is larger
+ * @param _regFileChar register file character stored in regFileChar
+ * @return true if the operand is a register operand, false otherwise (in
+ *         which case only regFileChar has been updated)
+ */
+bool
+BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
+                     unsigned &maxRegIdx, char _regFileChar)
+{
+    regFileChar = _regFileChar;
+
+    const BrigOperand *operand = obj->getOperand(opOffset);
+    const bool is_register = operand->kind == BRIG_KIND_OPERAND_REGISTER;
+
+    if (is_register) {
+        const BrigOperandRegister *reg = (const BrigOperandRegister*)operand;
+
+        regIdx = reg->regNum;
+
+        DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
+                reg->regKind);
+
+        if (maxRegIdx < regIdx) {
+            maxRegIdx = regIdx;
+        }
+    }
+
+    return is_register;
+}
+
+/**
+ * Initialise the list from a code-list operand: every element is looked up
+ * as an argument-segment symbol and appended to callArgs.
+ *
+ * Any other operand kind is a fatal error.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ */
+void
+ListOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);
+
+    if (brigOp->kind == BRIG_KIND_OPERAND_CODE_LIST) {
+        const BrigOperandCodeList *code_list =
+            (const BrigOperandCodeList*)brigOp;
+
+        const Brig::BrigData *list_data =
+            obj->getBrigBaseData(code_list->elements);
+
+        // Each element is a 4-byte offset.
+        // Note: for calls Dest list of operands could be size of 0.
+        elementCount = list_data->byteCount / 4;
+
+        DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
+                elementCount);
+
+        for (int i = 0; i < elementCount; ++i) {
+            // the "+ 1" steps over the first 4-byte word of the data entry
+            unsigned code_offset =
+                *(unsigned*)obj->getData(code_list->elements + 4 * (i + 1));
+
+            const BrigDirectiveVariable *var =
+                (const BrigDirectiveVariable*)obj->
+                getCodeSectionEntry(code_offset);
+
+            StorageElement *arg = obj->currentCode->storageMap->
+                findSymbol(BRIG_SEGMENT_ARG, var);
+
+            assert(arg);
+            callArgs.push_back(arg);
+        }
+    } else {
+        fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
+    }
+}
+
+/**
+ * Render the list as the names of its call arguments, each followed by a
+ * single space.
+ *
+ * @return the concatenated names; empty if there are no arguments
+ */
+std::string
+ListOperand::disassemble()
+{
+    std::string text;
+
+    for (const auto &arg : callArgs) {
+        text += csprintf("%s ", arg->name.c_str());
+    }
+
+    return text;
+}
+
+/**
+ * Initialise from a code-reference operand by recording the name of the
+ * executable directive it refers to in func_name.
+ *
+ * Any other operand kind is a fatal error.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ */
+void
+FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *operand = obj->getOperand(opOffset);
+
+    if (operand->kind != BRIG_KIND_OPERAND_CODE_REF) {
+        fatal("FunctionRefOperand: bad operand kind %d\n", operand->kind);
+    }
+
+    const BrigOperandCodeRef *code_ref = (const BrigOperandCodeRef*)operand;
+
+    const BrigDirectiveExecutable *callee =
+        (const BrigDirectiveExecutable*)
+        obj->getCodeSectionEntry(code_ref->ref);
+
+    func_name = obj->getString(callee->name);
+}
+
+/**
+ * Render the operand as the referenced function's name.
+ *
+ * @return func_name formatted as a string
+ */
+std::string
+FunctionRefOperand::disassemble()
+{
+    // also trace the name under the GPUReg debug flag
+    DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);
+
+    return csprintf("%s", func_name);
+}
+
+/**
+ * Initialise this operand from element "at" of an operand-list operand.
+ *
+ * @param opOffset offset of the operand list, handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ * @param at index of the list element to use
+ * @param maxRegIdx running maximum register index; raised to regIdx if
+ *                  regIdx is larger
+ * @param _regFileChar register file character stored in regFileChar
+ * @return true if opOffset names an operand list whose selected element is
+ *         a register operand, false otherwise
+ */
+bool
+BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
+                               int at, unsigned &maxRegIdx, char _regFileChar)
+{
+    regFileChar = _regFileChar;
+
+    const BrigOperand *list_op = obj->getOperand(opOffset);
+    if (list_op->kind != BRIG_KIND_OPERAND_OPERAND_LIST) {
+        return false;
+    }
+
+    const Brig::BrigOperandOperandList *reg_list =
+         (const Brig::BrigOperandOperandList*)list_op;
+
+    // elements are 4-byte operand offsets; the "+ 1" steps over the first
+    // 4-byte word of the data entry
+    unsigned elem_offset =
+        *(unsigned*)obj->getData(reg_list->elements + 4 * (at + 1));
+
+    const BrigOperand *elem =
+        (const BrigOperand*)obj->getOperand(elem_offset);
+    if (elem->kind != BRIG_KIND_OPERAND_REGISTER) {
+        return false;
+    }
+
+    const BrigOperandRegister *reg = (const BrigOperandRegister*)elem;
+
+    regIdx = reg->regNum;
+
+    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx,
+            reg->regKind);
+
+    if (maxRegIdx < regIdx) {
+        maxRegIdx = regIdx;
+    }
+
+    return true;
+}
+
+/**
+ * Initialise this operand by parsing a register name of the form
+ * "$<regFileChar><decimal index>" from the string at strOffset.
+ *
+ * A name that does not start with '$' followed by _regFileChar, or that has
+ * no digits after that prefix, is a fatal error.
+ *
+ * @param strOffset offset handed to obj->getString()
+ * @param obj BRIG object the string is read from
+ * @param maxRegIdx running maximum register index; raised to regIdx if
+ *                  regIdx is larger
+ * @param _regFileChar expected register file character; also stored in
+ *                     regFileChar, as the other init paths do
+ */
+void
+BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+                     unsigned &maxRegIdx, char _regFileChar)
+{
+    regFileChar = _regFileChar;
+
+    const char *name = obj->getString(strOffset);
+
+    // Check the two-character prefix before looking at name + 2: for a
+    // shorter string that address lies beyond the terminating NUL.
+    if (name[0] != '$' || name[1] != _regFileChar) {
+        fatal("register operand parse error on \"%s\"\n", name);
+    }
+
+    char *endptr;
+    regIdx = strtoul(name + 2, &endptr, 10);
+
+    // strtoul() leaves endptr at the start when it consumed no digits
+    if (endptr == name + 2) {
+        fatal("register operand parse error on \"%s\"\n", name);
+    }
+
+    maxRegIdx = std::max(maxRegIdx, regIdx);
+}
+
+// Storage for the per-class running maximum register index.  Each is
+// passed by reference into the BaseRegOperand init functions, which raise
+// it when they see a larger regIdx.
+unsigned SRegOperand::maxRegIdx;
+unsigned DRegOperand::maxRegIdx;
+unsigned CRegOperand::maxRegIdx;
+
+/** Render the operand as "$s<regIdx>". */
+std::string
+SRegOperand::disassemble()
+{
+    return csprintf("$s%d", regIdx);
+}
+
+/** Render the operand as "$d<regIdx>". */
+std::string
+DRegOperand::disassemble()
+{
+    return csprintf("$d%d", regIdx);
+}
+
+/** Render the operand as "$c<regIdx>". */
+std::string
+CRegOperand::disassemble()
+{
+    return csprintf("$c%d", regIdx);
+}
+
+/**
+ * Classify the operand at opOffset.
+ *
+ * - Register operand: returns its own kind and register kind.
+ * - Operand list: returns the list kind and the register kind derived from
+ *   the last element (a register's kind, or SINGLE/DOUBLE for 4-/8-byte
+ *   constants).  An empty list yields register kind 0.
+ * - Address operand without a register: returns BRIG_KIND_OPERAND_ADDRESS
+ *   and the type of the referenced variable (BRIG_TYPE_NONE if there is no
+ *   symbol).
+ * - Address operand with a register: returns BRIG_KIND_OPERAND_REGISTER and
+ *   that register's kind.
+ *
+ * Any other operand kind, list element kind or constant size is a fatal
+ * error.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ * @return the operand classification described above
+ */
+BrigRegOperandInfo
+findRegDataType(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    switch (baseOp->kind) {
+      case BRIG_KIND_OPERAND_REGISTER:
+        {
+            const BrigOperandRegister *op =
+                (const BrigOperandRegister*)baseOp;
+
+            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
+                                      (BrigRegisterKind)op->regKind);
+        }
+        break;
+
+      case BRIG_KIND_OPERAND_OPERAND_LIST:
+        {
+            const BrigOperandOperandList *op =
+                (const BrigOperandOperandList*)baseOp;
+            const BrigData *data_p =
+                (const BrigData*)obj->getData(op->elements);
+
+            // overwritten by every element, so the last one wins
+            BrigRegisterKind reg_kind = (BrigRegisterKind)0;
+
+            // the data bytes are a packed array of 4-byte operand offsets
+            for (uint32_t offset = 0; offset < data_p->byteCount;
+                 offset += 4) {
+                const BrigOperand *op_p = (const BrigOperand *)
+                    obj->getOperand(((const int *)data_p->bytes)[offset / 4]);
+
+                if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
+                    const BrigOperandRegister *brigRegOp =
+                        (const BrigOperandRegister*)op_p;
+                    reg_kind = (BrigRegisterKind)brigRegOp->regKind;
+                } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                    // size of the operand record minus its BrigBase header
+                    uint16_t num_bytes =
+                        ((const Brig::BrigOperandConstantBytes*)op_p)->
+                            base.byteCount - sizeof(BrigBase);
+                    if (num_bytes == sizeof(uint32_t)) {
+                        reg_kind = BRIG_REGISTER_KIND_SINGLE;
+                    } else if (num_bytes == sizeof(uint64_t)) {
+                        reg_kind = BRIG_REGISTER_KIND_DOUBLE;
+                    } else {
+                        fatal("OperandList: bad operand size %d\n", num_bytes);
+                    }
+                } else {
+                    fatal("OperandList: bad operand kind %d\n", op_p->kind);
+                }
+            }
+
+            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
+        }
+        break;
+
+      case BRIG_KIND_OPERAND_ADDRESS:
+        {
+            const BrigOperandAddress *op = (const BrigOperandAddress*)baseOp;
+
+            if (!op->reg) {
+                BrigType type = BRIG_TYPE_NONE;
+
+                if (op->symbol) {
+                    const BrigDirective *dir = (const BrigDirective*)
+                        obj->getCodeSectionEntry(op->symbol);
+
+                    assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+
+                    const BrigDirectiveVariable *sym =
+                       (const BrigDirectiveVariable*)dir;
+
+                    type = (BrigType)sym->type;
+                }
+                return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
+                                          (BrigType)type);
+            } else {
+                // the address is register based: report the register's kind
+                const BrigOperand *reg = obj->getOperand(op->reg);
+                const BrigOperandRegister *rop =
+                    (const BrigOperandRegister*)reg;
+
+                return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
+                                          (BrigRegisterKind)rop->regKind);
+            }
+        }
+        break;
+
+     default:
+       fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
+       break;
+   }
+}
+
+/**
+ * Resolve the variable referenced by an address operand: record its name
+ * and look up its storage element in the current code's storage map.
+ *
+ * For non-argument segments the lookup is by (segment, name) and offset is
+ * reset to 0.  For the argument segment the lookup is by directive pointer
+ * and offset is left untouched.
+ *
+ * @param op address operand to parse; must be of kind
+ *           BRIG_KIND_OPERAND_ADDRESS and reference a variable directive
+ * @param obj BRIG object the operand belongs to
+ */
+void
+AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
+{
+    assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);
+
+    const BrigDirective *dir =
+        (BrigDirective*)obj->getCodeSectionEntry(op->symbol);
+    assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+
+    const BrigDirectiveVariable *var = (BrigDirectiveVariable*)dir;
+    name = obj->getString(var->name);
+
+    if (var->segment == BRIG_SEGMENT_ARG) {
+        // The variable name cannot identify BRIG_SEGMENT_ARG symbols.
+        // Consider:
+        //
+        //     void foo(int a);
+        //     void bar(double a);
+        //
+        //     foo(...) --> arg_u32 %param_p0;
+        //                  st_arg_u32 $s0, [%param_p0];
+        //                  call &foo (%param_p0);
+        //     bar(...) --> arg_f64 %param_p0;
+        //                  st_arg_u64 $d0, [%param_p0];
+        //                  call &bar (%param_p0);
+        //
+        // Both call sites use the same variable name (param_p0).  This may
+        // be a compiler bug.
+        //
+        // Work-around: key the lookup on the directive pointer
+        // (BrigDirectiveVariable), which differs between the two versions
+        // of param_p0.
+        //
+        // This undermines the point of the symbol table somewhat: data is
+        // pulled out of the BRIG binary via the directive pointer into the
+        // table, and the table is then indexed by that same pointer.  It
+        // is kept this way to avoid disturbing the rest of the
+        // infrastructure.
+        //
+        // When the compiler is next updated, check whether the problem
+        // has gone away.  If so, some of this can be folded into the
+        // kernel argument code.  If not, the symbol could perhaps be
+        // indexed on a hash of the variable AND function name.
+        storageElement = obj->currentCode->
+                 storageMap->findSymbol((Brig::BrigSegment)var->segment, var);
+    } else {
+        storageElement =
+            obj->currentCode->storageMap->findSymbol(var->segment, name);
+        offset = 0;
+    }
+
+    assert(storageElement);
+}
+
+/**
+ * Compute the part of the address that does not depend on a register:
+ * the operand's offset plus, when a storage element is set, that element's
+ * offset.
+ *
+ * @return the combined base address
+ */
+uint64_t
+AddrOperandBase::calcUniformBase()
+{
+    // the explicit offset always contributes (0 if none was specified)
+    uint64_t base = offset;
+
+    if (!storageElement) {
+        return base;
+    }
+
+    // a symbol was resolved: add where it lives
+    return base + storageElement->offset;
+}
+
+/**
+ * Render the address operand in bracket notation.
+ *
+ * With a register string and/or a non-zero offset the result is
+ * "[<reg>+<offset>]", omitting whichever part is absent (the "+" is only
+ * emitted when offset > 0).  Otherwise the result is "[<name>]" if a name
+ * is set, or an empty string.
+ *
+ * @param reg_disassembly text for the register part; empty if none
+ * @return the formatted operand
+ */
+std::string
+AddrOperandBase::disassemble(std::string reg_disassembly)
+{
+    const bool has_reg = !reg_disassembly.empty();
+
+    if (!offset && !has_reg) {
+        // nothing but (possibly) a symbol name to show
+        if (name) {
+            return csprintf("[%s]", name);
+        }
+
+        return std::string();
+    }
+
+    std::string text("[");
+
+    if (has_reg) {
+        text += reg_disassembly;
+
+        if (offset > 0) {
+            text += "+";
+        }
+    }
+
+    if (offset) {
+        text += csprintf("%d", offset);
+    }
+
+    text += "]";
+
+    return text;
+}
+
+/**
+ * Initialise from an address operand: resolve its symbol via parseAddr()
+ * and then set offset from the operand's 64-bit (hi, lo) offset pair.
+ *
+ * Any other operand kind is a fatal error.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ */
+void
+NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *operand = obj->getOperand(opOffset);
+
+    if (operand->kind != BRIG_KIND_OPERAND_ADDRESS) {
+        fatal("NoRegAddrOperand: bad operand kind %d\n", operand->kind);
+    } else {
+        const BrigOperandAddress *addr = (const BrigOperandAddress*)operand;
+
+        parseAddr(addr, obj);
+
+        // assigned after parseAddr(), which may have reset offset to 0
+        const uint64_t upper = uint64_t(addr->offset.hi) << 32;
+        const uint64_t lower = uint64_t(addr->offset.lo);
+        offset = upper | lower;
+    }
+}
+
+/** Render the operand via AddrOperandBase with an empty register part. */
+std::string
+NoRegAddrOperand::disassemble()
+{
+    return AddrOperandBase::disassemble(std::string(""));
+}
+
+/**
+ * Initialise from a code-reference operand that points at a label
+ * directive, storing the label obtained from the current code's
+ * refLabel().
+ *
+ * @param opOffset offset handed to obj->getOperand(); must name a
+ *                 BRIG_KIND_OPERAND_CODE_REF operand
+ * @param obj BRIG object the operand is read from
+ */
+void
+LabelOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperandCodeRef *code_ref =
+        (const BrigOperandCodeRef*)obj->getOperand(opOffset);
+    assert(code_ref->base.kind == BRIG_KIND_OPERAND_CODE_REF);
+
+    const BrigDirective *target =
+        (const BrigDirective*)obj->getCodeSectionEntry(code_ref->ref);
+    assert(target->kind == BRIG_KIND_DIRECTIVE_LABEL);
+
+    label = obj->currentCode->refLabel((BrigDirectiveLabel*)target, obj);
+}
+
+/**
+ * Return the label's value from label->get().  Neither the wavefront nor
+ * the lane argument is used.
+ */
+uint32_t
+LabelOperand::getTarget(Wavefront *w, int lane)
+{
+    return label->get();
+}
+
+/** Render the operand as the label's name. */
+std::string
+LabelOperand::disassemble()
+{
+    return label->name;
+}
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh
new file mode 100644
index 000000000..e3d275b10
--- /dev/null
+++ b/src/arch/hsail/operand.hh
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_OPERAND_HH__
+#define __ARCH_HSAIL_OPERAND_HH__
+
+/**
+ *  @file operand.hh
+ *
+ *  Defines classes encapsulating HSAIL instruction operands.
+ */
+
+#include <cstring>
+#include <string>
+
+#include "arch/hsail/Brig.h"
+#include "base/trace.hh"
+#include "base/types.hh"
+#include "debug/GPUReg.hh"
+#include "enums/RegisterType.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+class Label;
+class StorageElement;
+
+/**
+ * Root of the operand class hierarchy.  Carries the register type and
+ * operand size and provides default answers for operands that are not
+ * registers.
+ */
+class BaseOperand
+{
+  public:
+    Enums::RegisterType registerType;
+    uint32_t regOperandSize;
+
+    // a fresh operand is not a register and has size 0
+    BaseOperand() : registerType(Enums::RT_NONE), regOperandSize(0) { }
+
+    virtual ~BaseOperand() { }
+
+    bool
+    isVectorRegister()
+    {
+        return registerType == Enums::RT_VECTOR;
+    }
+
+    bool
+    isScalarRegister()
+    {
+        return registerType == Enums::RT_SCALAR;
+    }
+
+    bool
+    isCondRegister()
+    {
+        return registerType == Enums::RT_CONDITION;
+    }
+
+    // non-virtual default; the base class has no register index
+    unsigned int
+    regIndex()
+    {
+        return 0;
+    }
+
+    uint32_t
+    opSize()
+    {
+        return regOperandSize;
+    }
+};
+
+/**
+ * Result of classifying an operand: its operand kind plus either a data
+ * type or a register kind.
+ *
+ * Every constructor initialises all three members, so the member that a
+ * given constructor is not told about holds a defined "none" value
+ * (BRIG_TYPE_NONE, or register kind 0) instead of an indeterminate one.
+ */
+class BrigRegOperandInfo
+{
+  public:
+    Brig::BrigKind16_t kind;
+    Brig::BrigType type;
+    Brig::BrigRegisterKind regKind;
+
+    /** Describe an operand by its register kind; type is BRIG_TYPE_NONE. */
+    BrigRegOperandInfo(Brig::BrigKind16_t _kind,
+                       Brig::BrigRegisterKind _regKind)
+        : kind(_kind), type(Brig::BRIG_TYPE_NONE), regKind(_regKind)
+    {
+    }
+
+    /** Describe an operand by its data type; regKind is 0. */
+    BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
+        : kind(_kind), type(_type), regKind((Brig::BrigRegisterKind)0)
+    {
+    }
+
+    /** Default: a constant-bytes operand with no type and regKind 0. */
+    BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
+                           type(Brig::BRIG_TYPE_NONE),
+                           regKind((Brig::BrigRegisterKind)0)
+    {
+    }
+};
+
+/**
+ * Classify the operand at opOffset in obj, reporting its operand kind
+ * together with either a register kind or a data type.  Defined in
+ * operand.cc.
+ */
+BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);
+
+/**
+ * Shared state and initialisation paths for operands that name a register.
+ * The init functions are defined in operand.cc.
+ */
+class BaseRegOperand : public BaseOperand
+{
+  public:
+    // register number, set by whichever init function is used
+    unsigned regIdx;
+    // register file character supplied by the caller of the init function
+    char regFileChar;
+
+    // initialise from a register operand; returns false for other kinds
+    bool init(unsigned opOffset, const BrigObject *obj,
+              unsigned &maxRegIdx, char _regFileChar);
+
+    // initialise from element "at" of an operand list; returns false if
+    // the operand is not a list or the element is not a register
+    bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
+                        unsigned &maxRegIdx, char _regFileChar);
+
+    // initialise by parsing a register name string
+    void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+                           unsigned &maxRegIdx, char _regFileChar);
+    // hides (does not override) the non-virtual BaseOperand::regIndex()
+    unsigned int regIndex() { return regIdx; }
+};
+
+/**
+ * Operand naming a 32-bit "$s" register.  Values are read from and written
+ * to the vector register file of the wavefront's SIMD unit.
+ */
+class SRegOperand : public BaseRegOperand
+{
+  public:
+    // running maximum $s register index, updated by the init functions
+    static unsigned maxRegIdx;
+
+    /** Initialise from a register operand; false if it is not one. */
+    bool
+    init(unsigned opOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint32_t);
+
+        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
+    }
+
+    /** Initialise from element "at" of an operand list. */
+    bool
+    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint32_t);
+
+        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+                                              's');
+    }
+
+    /** Initialise by parsing a "$s<n>" register name string. */
+    void
+    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint32_t);
+
+        BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, 's');
+    }
+
+    /**
+     * Read this register for one lane as OperandType (1, 2 or 4 bytes).
+     * For 1- and 2-byte types the 32-bit register value is masked down to
+     * its low 8 or 16 bits.
+     */
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane)
+    {
+        assert(sizeof(OperandType) <= sizeof(uint32_t));
+        assert(regIdx < w->maxSpVgprs);
+
+        OperandType value;
+        uint32_t phys_idx;
+
+        switch (sizeof(OperandType)) {
+          case 1:
+            // keep the low byte of the 32-bit register
+            phys_idx = w->remap(regIdx, 1, 1);
+            value = (w->computeUnit->vrf[w->simdId]->
+                     read<uint32_t>(phys_idx, lane)) & 0xff;
+            break;
+          case 2:
+            // keep the low half-word of the 32-bit register
+            phys_idx = w->remap(regIdx, 2, 1);
+            value = (w->computeUnit->vrf[w->simdId]->
+                     read<uint32_t>(phys_idx, lane)) & 0xffff;
+            break;
+          case 4:
+            // full width: read directly as the requested type
+            phys_idx = w->remap(regIdx, sizeof(OperandType), 1);
+            value = w->computeUnit->vrf[w->simdId]->
+                read<OperandType>(phys_idx, lane);
+            break;
+          default:
+            panic("Bad OperandType\n");
+            break;
+        }
+
+        return value;
+    }
+
+    // special get method for compatibility with LabelOperand
+    uint32_t
+    getTarget(Wavefront *w, int lane)
+    {
+        return get<uint32_t>(w, lane);
+    }
+
+    template<typename OperandType>
+    void set(Wavefront *w, int lane, OperandType &val);
+
+    std::string disassemble();
+};
+
+/**
+ * Write val into this $s register for one lane.  OperandType must be
+ * exactly 32 bits wide.
+ *
+ * @param w wavefront owning the register
+ * @param lane lane to write
+ * @param val value to store
+ */
+template<typename OperandType>
+void
+SRegOperand::set(Wavefront *w, int lane, OperandType &val)
+{
+    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+            w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+    assert(sizeof(OperandType) == sizeof(uint32_t));
+    assert(regIdx < w->maxSpVgprs);
+
+    uint32_t phys_idx = w->remap(regIdx, sizeof(OperandType), 1);
+
+    w->computeUnit->vrf[w->simdId]->write<OperandType>(phys_idx, val, lane);
+}
+
+/**
+ * Specialisation for 64-bit values: the value is written through the
+ * 32-bit write path, so it is stored as a uint32_t.
+ *
+ * @param w wavefront owning the register
+ * @param lane lane to write
+ * @param val value to store
+ */
+template<>
+inline void
+SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
+{
+    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+            w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+    assert(regIdx < w->maxSpVgprs);
+
+    uint32_t phys_idx = w->remap(regIdx, sizeof(uint32_t), 1);
+
+    w->computeUnit->vrf[w->simdId]->write<uint32_t>(phys_idx, val, lane);
+}
+
+/**
+ * Operand naming a 64-bit "$d" register.  Values are read from and written
+ * to the vector register file of the wavefront's SIMD unit.
+ */
+class DRegOperand : public BaseRegOperand
+{
+  public:
+    // running maximum $d register index, updated by the init functions
+    static unsigned maxRegIdx;
+
+    /** Initialise from a register operand; false if it is not one. */
+    bool
+    init(unsigned opOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint64_t);
+
+        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
+    }
+
+    /** Initialise from element "at" of an operand list. */
+    bool
+    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint64_t);
+
+        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+                                              'd');
+    }
+
+    /** Initialise by parsing a "$d<n>" register name string. */
+    void
+    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_VECTOR;
+        regOperandSize = sizeof(uint64_t);
+
+        BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, 'd');
+    }
+
+    /** Read this register for one lane as OperandType (up to 8 bytes). */
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane)
+    {
+        assert(sizeof(OperandType) <= sizeof(uint64_t));
+        // TODO: this check is valid only for HSAIL
+        assert(regIdx < w->maxDpVgprs);
+
+        uint32_t phys_idx = w->remap(regIdx, sizeof(OperandType), 1);
+
+        return w->computeUnit->vrf[w->simdId]->
+            read<OperandType>(phys_idx, lane);
+    }
+
+    /** Write val into this register for one lane. */
+    template<typename OperandType>
+    void
+    set(Wavefront *w, int lane, OperandType &val)
+    {
+        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
+                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+                val);
+
+        assert(sizeof(OperandType) <= sizeof(uint64_t));
+        // TODO: this check is valid only for HSAIL
+        assert(regIdx < w->maxDpVgprs);
+
+        uint32_t phys_idx = w->remap(regIdx, sizeof(OperandType), 1);
+
+        w->computeUnit->vrf[w->simdId]->
+            write<OperandType>(phys_idx, val, lane);
+    }
+
+    std::string disassemble();
+};
+
+/**
+ * Operand naming a "$c" condition register.  Values are read from and
+ * written to the wavefront's condition register state.
+ */
+class CRegOperand : public BaseRegOperand
+{
+  public:
+    // running maximum $c register index, updated by the init functions
+    static unsigned maxRegIdx;
+
+    /** Initialise from a register operand; false if it is not one. */
+    bool
+    init(unsigned opOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_CONDITION;
+        regOperandSize = sizeof(uint8_t);
+
+        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
+    }
+
+    /** Initialise from element "at" of an operand list. */
+    bool
+    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+    {
+        registerType = Enums::RT_CONDITION;
+        regOperandSize = sizeof(uint8_t);
+
+        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+                                              'c');
+    }
+
+    /** Initialise by parsing a "$c<n>" register name string. */
+    void
+    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+    {
+        registerType = Enums::RT_CONDITION;
+        regOperandSize = sizeof(uint8_t);
+
+        BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, 'c');
+    }
+
+    /** Read this condition register for one lane. */
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane)
+    {
+        assert(regIdx < w->condRegState->numRegs());
+
+        return w->condRegState->read<OperandType>((int)regIdx, lane);
+    }
+
+    /** Write val into this condition register for one lane. */
+    template<typename OperandType>
+    void
+    set(Wavefront *w, int lane, OperandType &val)
+    {
+        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
+                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+                val);
+
+        assert(regIdx < w->condRegState->numRegs());
+
+        w->condRegState->write<OperandType>(regIdx, lane, val);
+    }
+
+    std::string disassemble();
+};
+
+/**
+ * Immediate operand holding its value in a field of type T.
+ */
+template<typename T>
+class ImmOperand : public BaseOperand
+{
+  public:
+    // raw value of the immediate
+    T bits;
+
+    bool init(unsigned opOffset, const BrigObject *obj);
+    bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+    std::string disassemble();
+
+    /**
+     * Reinterpret the first sizeof(OperandType) bytes of the stored value
+     * as OperandType.  OperandType must not be larger than T.
+     *
+     * The bytes are copied with memcpy rather than read through a cast
+     * pointer, which would violate the aliasing rules whenever
+     * OperandType and T are unrelated types (e.g. float and uint32_t).
+     */
+    template<typename OperandType>
+    OperandType
+    get()
+    {
+        assert(sizeof(OperandType) <= sizeof(T));
+
+        OperandType value;
+        std::memcpy(&value, &bits, sizeof(OperandType));
+
+        return value;
+    }
+
+    // This version of get() takes a WF* and a lane id for
+    // compatibility with the register-based get() methods.
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane)
+    {
+        return get<OperandType>();
+    }
+};
+
+/**
+ * Initialise the immediate from the BRIG operand at opOffset.
+ *
+ * A constant-bytes operand supplies the value from its data; a wavesize
+ * operand yields VSZ.
+ *
+ * @param opOffset offset handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ * @return true if the operand was one of the two supported kinds
+ */
+template<typename T>
+bool
+ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
+{
+    const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
+
+    // this is immediate operand
+    if (brigOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+        DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
+                brigOp->byteCount);
+
+        auto const_bytes = (Brig::BrigOperandConstantBytes*)brigOp;
+
+        // the "+ 4" steps over the first 4-byte word of the data entry
+        bits = *((T*)(obj->getData(const_bytes->bytes + 4)));
+
+        return true;
+    }
+
+    if (brigOp->kind == Brig::BRIG_KIND_OPERAND_WAVESIZE) {
+        bits = VSZ;
+
+        return true;
+    }
+
+    return false;
+}
+
+/**
+ * Initialise the immediate from element "at" of an operand-list operand.
+ *
+ * @param opOffset offset of the operand list, handed to obj->getOperand()
+ * @param obj BRIG object the operand is read from
+ * @param at index of the list element to use
+ * @return false if opOffset is not an operand list or the selected element
+ *         is not a constant-bytes operand; otherwise the result of init()
+ *         on that element
+ */
+template <typename T>
+bool
+ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+{
+    const Brig::BrigOperand *list_op = obj->getOperand(opOffset);
+
+    if (list_op->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+        return false;
+    }
+
+    const Brig::BrigOperandOperandList *op_list =
+         (const Brig::BrigOperandOperandList *)list_op;
+
+    // elements are 4-byte operand offsets; the "+ 1" steps over the first
+    // 4-byte word of the data entry
+    unsigned elem_offset =
+        *(unsigned *)obj->getData(op_list->elements + 4 * (at + 1));
+
+    const Brig::BrigOperand *elem =
+        (const Brig::BrigOperand *)obj->getOperand(elem_offset);
+
+    if (elem->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+        return false;
+    }
+
+    return init(elem_offset, obj);
+}
+/** Render the immediate with the "0x%08x" format. */
+template<typename T>
+std::string
+ImmOperand<T>::disassemble()
+{
+    return csprintf("0x%08x", bits);
+}
+
+/**
+ * Operand that is either a register (RegOperand) or an immediate of type
+ * T.  The is_imm flag selects which of the two embedded operands is live;
+ * the query methods answer for the register and fall back to
+ * false / 0 for an immediate.
+ */
+template<typename RegOperand, typename T>
+class RegOrImmOperand : public BaseOperand
+{
+  private:
+    // true when imm_op holds the value, false when reg_op does
+    bool is_imm;
+
+  public:
+    void setImm(const bool value) { is_imm = value; }
+
+    ImmOperand<T> imm_op;
+    RegOperand reg_op;
+
+    RegOrImmOperand() { is_imm = false; }
+    void init(unsigned opOffset, const BrigObject *obj);
+    void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+    std::string disassemble();
+
+    /** Read the value from whichever operand is live. */
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane)
+    {
+        if (is_imm) {
+            return imm_op.template get<OperandType>();
+        }
+
+        return reg_op.template get<OperandType>(w, lane);
+    }
+
+    /** Register operand size, or 0 for an immediate. */
+    uint32_t
+    opSize()
+    {
+        return is_imm ? 0 : reg_op.opSize();
+    }
+
+    bool
+    isVectorRegister()
+    {
+        return !is_imm && reg_op.registerType == Enums::RT_VECTOR;
+    }
+
+    bool
+    isCondRegister()
+    {
+        return !is_imm && reg_op.registerType == Enums::RT_CONDITION;
+    }
+
+    bool
+    isScalarRegister()
+    {
+        return !is_imm && reg_op.registerType == Enums::RT_SCALAR;
+    }
+
+    /** Register index, or 0 for an immediate. */
+    unsigned int
+    regIndex()
+    {
+        return is_imm ? 0 : reg_op.regIndex();
+    }
+};
+
+// Initialize from the BRIG operand at opOffset. The operand is first
+// offered to the register operand; if that declines it is offered to the
+// immediate operand. is_imm ends up true only when the immediate
+// accepted it. An operand accepted by neither is a fatal error.
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
+{
+    is_imm = false;
+
+    if (reg_op.init(opOffset, obj)) {
+        // register operand: is_imm stays false
+    } else if (imm_op.init(opOffset, obj)) {
+        is_imm = true;
+    } else {
+        fatal("RegOrImmOperand::init(): bad operand kind %d\n",
+              obj->getOperand(opOffset)->kind);
+    }
+}
+
+// Initialize from element 'at' of the operand list at opOffset. The
+// element is first offered to the register operand, then to the
+// immediate operand; is_imm records which one accepted it. If neither
+// does, this is a fatal error.
+//
+// Note that the kind printed in the error message is that of the
+// operand at opOffset (the list), not that of the selected element.
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
+                                               const BrigObject *obj, int at)
+{
+    if (reg_op.init_from_vect(opOffset, obj, at)) {
+        is_imm = false;
+
+        return;
+    }
+
+    if (imm_op.init_from_vect(opOffset, obj, at)) {
+        is_imm = true;
+
+        return;
+    }
+
+    // Name this function in the message (it used to claim init()), so a
+    // failure can be attributed to the vector-element path.
+    fatal("RegOrImmOperand::init_from_vect(): bad operand kind %d\n",
+          obj->getOperand(opOffset)->kind);
+}
+
+// Disassemble whichever sub-operand is currently selected by is_imm.
+template<typename RegOperand, typename T>
+std::string
+RegOrImmOperand<RegOperand, T>::disassemble()
+{
+    if (is_imm) {
+        return imm_op.disassemble();
+    }
+
+    return reg_op.disassemble();
+}
+
+// Register-or-immediate operands, pairing each register operand class
+// with the immediate value type used for it.
+using SRegOrImmOperand = RegOrImmOperand<SRegOperand, uint32_t>;
+using DRegOrImmOperand = RegOrImmOperand<DRegOperand, uint64_t>;
+using CRegOrImmOperand = RegOrImmOperand<CRegOperand, bool>;
+
+// Common base class of the address operands. Derived classes in this
+// file build their addresses on top of calcUniformBase() and must
+// provide the per-lane (calcLane) and per-wavefront (calcVector)
+// address computations.
+class AddrOperandBase : public BaseOperand
+{
+  protected:
+    // helper function for init()
+    void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);
+
+    // helper function for disassemble()
+    std::string disassemble(std::string reg_disassembly);
+
+    // Address component shared by all lanes. Defined outside this
+    // header; presumably derived from 'storageElement' and 'offset'.
+    uint64_t calcUniformBase();
+
+  public:
+    // Write the address of each lane into addrVec[lane].
+    virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+
+    // Return the address for a single lane.
+    virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
+
+    // All members carry in-class initializers (as 'name' already did) so
+    // that an operand queried before init() has deterministic contents
+    // instead of indeterminate ones.
+    uint64_t offset = 0;
+    const char *name = nullptr;
+    StorageElement *storageElement = nullptr;
+};
+
+// Address operand whose per-lane address is the base computed by
+// AddrOperandBase plus the value of a register of type RegOperandType.
+// The register queries below forward to the embedded 'reg' operand.
+template<typename RegOperandType>
+class RegAddrOperand : public AddrOperandBase
+{
+  public:
+    RegOperandType reg;
+
+    void init(unsigned opOffset, const BrigObject *obj);
+    uint64_t calcUniform();
+    void calcVector(Wavefront *w, uint64_t *addrVec);
+    uint64_t calcLane(Wavefront *w, int lane=0);
+    std::string disassemble();
+
+    uint32_t
+    opSize()
+    {
+        return reg.opSize();
+    }
+
+    bool
+    isVectorRegister()
+    {
+        return reg.registerType == Enums::RT_VECTOR;
+    }
+
+    bool
+    isCondRegister()
+    {
+        return reg.registerType == Enums::RT_CONDITION;
+    }
+
+    bool
+    isScalarRegister()
+    {
+        return reg.registerType == Enums::RT_SCALAR;
+    }
+
+    unsigned int
+    regIndex()
+    {
+        return reg.regIndex();
+    }
+};
+
+// Initialize from the BRIG address operand at opOffset: clear the
+// storage element, record the 64-bit offset, initialize the embedded
+// register operand and, for 's' / 'd' registers, set the register
+// operand size (uint32_t / uint64_t) and mark this operand as a vector
+// register. Any operand kind other than BRIG_KIND_OPERAND_ADDRESS is a
+// fatal error.
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
+{
+    const Brig::BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    if (baseOp->kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+        const Brig::BrigOperandAddress *addrOp =
+            (Brig::BrigOperandAddress*)baseOp;
+
+        storageElement = nullptr;
+
+        // the offset is assembled from its 'hi' and 'lo' halves
+        const uint64_t hiPart = uint64_t(addrOp->offset.hi) << 32;
+        const uint64_t loPart = uint64_t(addrOp->offset.lo);
+        offset = hiPart | loPart;
+
+        // NOTE(review): the result of reg.init() is not checked here.
+        reg.init(addrOp->reg, obj);
+
+        const bool isSReg = (reg.regFileChar == 's');
+        const bool isDReg = (reg.regFileChar == 'd');
+
+        // other register files leave size and type untouched
+        if (isSReg || isDReg) {
+            reg.regOperandSize =
+                isSReg ? sizeof(uint32_t) : sizeof(uint64_t);
+            registerType = Enums::RT_VECTOR;
+        }
+    } else {
+        fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
+    }
+}
+
+// A register-based address has no single uniform value, so calling this
+// function always reports a fatal error.
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcUniform()
+{
+    fatal("can't do calcUniform() on register-based address\n");
+
+    // Presumably never reached (fatal() is expected to terminate); kept
+    // so that every path of this non-void function returns a value.
+    return uint64_t(0);
+}
+
+// Compute the address of every active lane of wavefront w as the
+// uniform base plus that lane's register value. 's' registers are read
+// as uint32_t, all others as Addr. Entries of addrVec belonging to
+// lanes that are masked off in the exec mask are not written.
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+    const Addr base = calcUniformBase();
+    const bool reg32 = (reg.regFileChar == 's');
+
+    for (int lane = 0; lane < VSZ; ++lane) {
+        if (!w->execMask(lane)) {
+            continue;
+        }
+
+        if (reg32) {
+            addrVec[lane] = base + reg.template get<uint32_t>(w, lane);
+        } else {
+            addrVec[lane] = base + reg.template get<Addr>(w, lane);
+        }
+    }
+}
+
+// Compute the address of a single lane of wavefront w: the uniform base
+// plus that lane's register value. Unlike calcVector(), the exec mask
+// is not consulted.
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
+{
+    const Addr base = calcUniformBase();
+
+    // Read the register with the same width calcVector() uses, so both
+    // entry points agree on the address of a lane: 's' registers are
+    // read as uint32_t, everything else as a full Addr.
+    if (reg.regFileChar == 's') {
+        return base + reg.template get<uint32_t>(w, lane);
+    }
+
+    return base + reg.template get<Addr>(w, lane);
+}
+
+// Disassemble the address, handing the register's own disassembly to
+// the shared AddrOperandBase helper.
+template<typename RegOperandType>
+std::string
+RegAddrOperand<RegOperandType>::disassemble()
+{
+    const std::string regText = reg.disassemble();
+
+    return AddrOperandBase::disassemble(regText);
+}
+
+// Register-based address operands for the S and D register operands.
+using SRegAddrOperand = RegAddrOperand<SRegOperand>;
+using DRegAddrOperand = RegAddrOperand<DRegOperand>;
+
+// Address operand without a register component: every lane gets the
+// same address, namely the uniform base computed by AddrOperandBase.
+class NoRegAddrOperand : public AddrOperandBase
+{
+  public:
+    void init(unsigned opOffset, const BrigObject *obj);
+    std::string disassemble();
+
+    // address computations (the same value for every lane)
+    uint64_t calcUniform();
+    uint64_t calcLane(Wavefront *w, int lane=0);
+    void calcVector(Wavefront *w, uint64_t *addrVec);
+};
+
+// The uniform address is exactly the base computed by AddrOperandBase.
+inline uint64_t
+NoRegAddrOperand::calcUniform()
+{
+    return calcUniformBase();
+}
+
+// Without a register component the address does not depend on the
+// lane; w and lane are ignored and the uniform base is returned.
+inline uint64_t
+NoRegAddrOperand::calcLane(Wavefront *w, int lane)
+{
+    return AddrOperandBase::calcUniformBase();
+}
+
+// Store the uniform address into all VSZ entries of addrVec. The
+// wavefront is not consulted, so entries are written regardless of
+// whether a lane is active.
+inline void
+NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+    const uint64_t uniformAddr = calcUniformBase();
+
+    for (uint64_t *slot = addrVec; slot != addrVec + VSZ; ++slot) {
+        *slot = uniformAddr;
+    }
+}
+
+// Operand that refers to a code label.
+class LabelOperand : public BaseOperand
+{
+  public:
+    // Initialized to nullptr so that use before init() is detectable
+    // rather than a read of an indeterminate pointer.
+    Label *label = nullptr;
+
+    void init(unsigned opOffset, const BrigObject *obj);
+    std::string disassemble();
+
+    // special get method for compatibility with SRegOperand
+    uint32_t getTarget(Wavefront *w, int lane);
+
+};
+
+// Operand holding a list of call arguments. Each argument is described
+// by a StorageElement whose 'offset' is used as the location of the
+// argument in the wavefront's call-argument memory.
+class ListOperand : public BaseOperand
+{
+  public:
+    // Zero until init() runs, so the count is never indeterminate.
+    int elementCount = 0;
+    std::vector<StorageElement*> callArgs;
+
+    // Return the offset of argument 'idx'. The lookup goes through
+    // std::vector::at(), so an out-of-range idx throws
+    // std::out_of_range.
+    int
+    getSrcOperand(int idx)
+    {
+        DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
+                callArgs.size());
+
+        return callArgs.at(idx)->offset;
+    }
+
+    void init(unsigned opOffset, const BrigObject *obj);
+
+    std::string disassemble();
+
+    // Read argument 'arg_idx' for the given lane from the wavefront's
+    // call-argument memory.
+    template<typename OperandType>
+    OperandType
+    get(Wavefront *w, int lane, int arg_idx)
+    {
+        return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
+    }
+
+    // Write 'val' for the given lane into the wavefront's call-argument
+    // memory. The destination is always argument 0 of this list.
+    template<typename OperandType>
+    void
+    set(Wavefront *w, int lane, OperandType val)
+    {
+        w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
+        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
+                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
+                getSrcOperand(0), val);
+    }
+};
+
+// Operand that refers to a function by name.
+class FunctionRefOperand : public BaseOperand
+{
+  public:
+    // Initialized to nullptr so that use before init() is detectable
+    // rather than a read of an indeterminate pointer.
+    const char *func_name = nullptr;
+
+    void init(unsigned opOffset, const BrigObject *obj);
+    std::string disassemble();
+};
+
+#endif // __ARCH_HSAIL_OPERAND_HH__