summaryrefslogtreecommitdiff
path: root/src/arch/hsail/insts/pseudo_inst.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/arch/hsail/insts/pseudo_inst.cc')
-rw-r--r--src/arch/hsail/insts/pseudo_inst.cc787
1 files changed, 787 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+ // Pseudo (or magic) instructions are overloaded on the hsail call
+ // instruction, because of its flexible parameter signature.
+
+ // To add a new magic instruction:
+ // 1. Add an entry to the enum.
+ // 2. Implement it in the switch statement below (Call::exec).
+ // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+ // so its easy to call from an OpenCL kernel.
+
+ // This enum should be identical to the enum in
+ // hsa/hsail-gpu-compute/util/magicinst.h
+ enum
+ {
+ MAGIC_PRINT_WF_32 = 0,
+ MAGIC_PRINT_WF_64,
+ MAGIC_PRINT_LANE,
+ MAGIC_PRINT_LANE_64,
+ MAGIC_PRINT_WF_FLOAT,
+ MAGIC_SIM_BREAK,
+ MAGIC_PREF_SUM,
+ MAGIC_REDUCTION,
+ MAGIC_MASKLANE_LOWER,
+ MAGIC_MASKLANE_UPPER,
+ MAGIC_JOIN_WF_BAR,
+ MAGIC_WAIT_WF_BAR,
+ MAGIC_PANIC,
+ MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
+ MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
+ MAGIC_LOAD_GLOBAL_U32_REG,
+ MAGIC_XACT_CAS_LD,
+ MAGIC_MOST_SIG_THD,
+ MAGIC_MOST_SIG_BROADCAST,
+ MAGIC_PRINT_WFID_32,
+ MAGIC_PRINT_WFID_64
+ };
+
+ void
+ Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ const VectorMask &mask = w->get_pred();
+
+ int op = 0;
+ bool got_op = false;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val0 = src1.get<int>(w, lane, 0);
+ if (got_op) {
+ if (src_val0 != op) {
+ fatal("Multiple magic instructions per PC not "
+ "supported\n");
+ }
+ } else {
+ op = src_val0;
+ got_op = true;
+ }
+ }
+ }
+
+ switch(op) {
+ case MAGIC_PRINT_WF_32:
+ MagicPrintWF32(w);
+ break;
+ case MAGIC_PRINT_WF_64:
+ MagicPrintWF64(w);
+ break;
+ case MAGIC_PRINT_LANE:
+ MagicPrintLane(w);
+ break;
+ case MAGIC_PRINT_LANE_64:
+ MagicPrintLane64(w);
+ break;
+ case MAGIC_PRINT_WF_FLOAT:
+ MagicPrintWFFloat(w);
+ break;
+ case MAGIC_SIM_BREAK:
+ MagicSimBreak(w);
+ break;
+ case MAGIC_PREF_SUM:
+ MagicPrefixSum(w);
+ break;
+ case MAGIC_REDUCTION:
+ MagicReduction(w);
+ break;
+ case MAGIC_MASKLANE_LOWER:
+ MagicMaskLower(w);
+ break;
+ case MAGIC_MASKLANE_UPPER:
+ MagicMaskUpper(w);
+ break;
+ case MAGIC_JOIN_WF_BAR:
+ MagicJoinWFBar(w);
+ break;
+ case MAGIC_WAIT_WF_BAR:
+ MagicWaitWFBar(w);
+ break;
+ case MAGIC_PANIC:
+ MagicPanic(w);
+ break;
+
+ // atomic instructions
+ case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+ MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+ MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_LOAD_GLOBAL_U32_REG:
+ MagicLoadGlobalU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_XACT_CAS_LD:
+ MagicXactCasLd(w);
+ break;
+
+ case MAGIC_MOST_SIG_THD:
+ MagicMostSigThread(w);
+ break;
+
+ case MAGIC_MOST_SIG_BROADCAST:
+ MagicMostSigBroadcast(w);
+ break;
+
+ case MAGIC_PRINT_WFID_32:
+ MagicPrintWF32ID(w);
+ break;
+
+ case MAGIC_PRINT_WFID_64:
+ MagicPrintWFID64(w);
+ break;
+
+ default: fatal("unrecognized magic instruction: %d\n", op);
+ }
+ }
+
+ void
+ Call::MagicPrintLane(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ if (src_val2) {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ } else {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ }
+ }
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintLane64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ if (src_val2) {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ } else {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ }
+ }
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF32(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+
+ if (src_val2) {
+ res_str += csprintf("%08x", src_val1);
+ } else {
+ res_str += csprintf("%08d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF32ID(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ int src_val3 = -1;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ src_val3 = src1.get<int>(w, lane, 3);
+
+ if (src_val2) {
+ res_str += csprintf("%08x", src_val1);
+ } else {
+ res_str += csprintf("%08d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ if (w->wfDynId == src_val3) {
+ DPRINTFN(res_str.c_str());
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 3)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+
+ if (src_val2) {
+ res_str += csprintf("%016x", src_val1);
+ } else {
+ res_str += csprintf("%016d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxxxxxxxxxx");
+ }
+
+ if ((lane & 3) == 3) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+ void
+ Call::MagicPrintWFID64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ int src_val3 = -1;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 3)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ src_val3 = src1.get<int>(w, lane, 3);
+
+ if (src_val2) {
+ res_str += csprintf("%016x", src_val1);
+ } else {
+ res_str += csprintf("%016d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxxxxxxxxxx");
+ }
+
+ if ((lane & 3) == 3) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ if (w->wfDynId == src_val3) {
+ DPRINTFN(res_str.c_str());
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWFFloat(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ float src_val1 = src1.get<float>(w, lane, 1);
+ res_str += csprintf("%08f", src_val1);
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+ // raises a signal that GDB will catch
+ // when done with the break, type "signal 0" in gdb to continue
+ void
+ Call::MagicSimBreak(Wavefront *w)
+ {
+ std::string res_str;
+ // print out state for this wavefront and then break
+ res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+ w->wfSlotId);
+
+ res_str += csprintf(" Kern ID: %i\n", w->kern_id);
+ res_str += csprintf(" Phase ID: %i\n", w->simdId);
+ res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
+ res_str += csprintf(" Exec mask: ");
+
+ for (int i = VSZ - 1; i >= 0; --i) {
+ if (w->execMask(i))
+ res_str += "1";
+ else
+ res_str += "0";
+
+ if ((i & 7) == 7)
+ res_str += " ";
+ }
+
+ res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+ res_str += "\nHelpful debugging hints:\n";
+ res_str += " Check out w->s_reg / w->d_reg for register state\n";
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ fflush(stdout);
+
+ raise(SIGTRAP);
+ }
+
+ void
+ Call::MagicPrefixSum(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ dest.set<int>(w, lane, res);
+ res += src_val1;
+ }
+ }
+ }
+
+ void
+ Call::MagicReduction(Wavefront *w)
+ {
+ // reduction magic instruction
+ // The reduction instruction takes up to 64 inputs (one from
+ // each thread in a WF) and sums them. It returns the sum to
+ // each thread in the WF.
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ res += src_val1;
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicMaskLower(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+
+ if (src_val1) {
+ if (lane < (VSZ/2)) {
+ res = res | ((uint32_t)(1) << lane);
+ }
+ }
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicMaskUpper(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+
+ if (src_val1) {
+ if (lane >= (VSZ/2)) {
+ res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+ }
+ }
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicJoinWFBar(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int max_cnt = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->bar_cnt[lane]++;
+
+ if (w->bar_cnt[lane] > max_cnt) {
+ max_cnt = w->bar_cnt[lane];
+ }
+ }
+ }
+
+ if (max_cnt > w->max_bar_cnt) {
+ w->max_bar_cnt = max_cnt;
+ }
+ }
+
+ void
+ Call::MagicWaitWFBar(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int max_cnt = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->bar_cnt[lane]--;
+ }
+
+ if (w->bar_cnt[lane] > max_cnt) {
+ max_cnt = w->bar_cnt[lane];
+ }
+ }
+
+ if (max_cnt < w->max_bar_cnt) {
+ w->max_bar_cnt = max_cnt;
+ }
+
+ w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+ w->instructionBuffer.end());
+ if (w->pendingFetch)
+ w->dropFetch = true;
+ }
+
+ void
+ Call::MagicPanic(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
+ src_val1, lane);
+ }
+ }
+ }
+
+ void
+ Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+ {
+ // the address is in src1 | src2
+ for (int lane = 0; lane < VSZ; ++lane) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
+
+ m->addr[lane] = addr;
+ }
+
+ }
+
+ void
+ Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+
+ calcAddr(w, m);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
+ }
+
+ m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+ Brig::BRIG_ATOMIC_ADD);
+ m->m_type = U32::memType;
+ m->v_type = U32::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0; // atomics don't have an equivalence class operand
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(64));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+ calcAddr(w, m);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
+ }
+
+ m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+ Brig::BRIG_ATOMIC_ADD);
+ m->m_type = U32::memType;
+ m->v_type = U32::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0; // atomics don't have an equivalence class operand
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(64));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+ // calculate the address
+ calcAddr(w, m);
+
+ m->m_op = Enums::MO_LD;
+ m->m_type = U32::memType; //MemDataType::memType;
+ m->v_type = U32::vgprType; //DestDataType::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0;
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ // FIXME
+ //m->dst_reg = this->dest.regIndex();
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicXactCasLd(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int src_val1 = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ src_val1 = src1.get<int>(w, lane, 1);
+ break;
+ }
+ }
+
+ if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
+ w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
+ w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
+ }
+
+ w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
+ .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+ }
+
+ void
+ Call::MagicMostSigThread(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ unsigned mst = true;
+
+ for (int lane = VSZ - 1; lane >= 0; --lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, mst);
+ mst = false;
+ }
+ }
+ }
+
+ void
+ Call::MagicMostSigBroadcast(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+ bool got_res = false;
+
+ for (int lane = VSZ - 1; lane >= 0; --lane) {
+ if (mask[lane]) {
+ if (!got_res) {
+ res = src1.get<int>(w, lane, 1);
+ got_res = true;
+ }
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+} // namespace HsailISA