diff options
Diffstat (limited to 'src/arch/hsail/insts/pseudo_inst.cc')
-rw-r--r-- | src/arch/hsail/insts/pseudo_inst.cc | 787 |
1 files changed, 787 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc new file mode 100644 index 000000000..9506a80ab --- /dev/null +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Marc Orr + */ + +#include <csignal> + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/mem.hh" + +namespace HsailISA +{ + // Pseudo (or magic) instructions are overloaded on the hsail call + // instruction, because of its flexible parameter signature. + + // To add a new magic instruction: + // 1. Add an entry to the enum. + // 2. Implement it in the switch statement below (Call::exec). + // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, + // so its easy to call from an OpenCL kernel. + + // This enum should be identical to the enum in + // hsa/hsail-gpu-compute/util/magicinst.h + enum + { + MAGIC_PRINT_WF_32 = 0, + MAGIC_PRINT_WF_64, + MAGIC_PRINT_LANE, + MAGIC_PRINT_LANE_64, + MAGIC_PRINT_WF_FLOAT, + MAGIC_SIM_BREAK, + MAGIC_PREF_SUM, + MAGIC_REDUCTION, + MAGIC_MASKLANE_LOWER, + MAGIC_MASKLANE_UPPER, + MAGIC_JOIN_WF_BAR, + MAGIC_WAIT_WF_BAR, + MAGIC_PANIC, + MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, + MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, + MAGIC_LOAD_GLOBAL_U32_REG, + MAGIC_XACT_CAS_LD, + MAGIC_MOST_SIG_THD, + MAGIC_MOST_SIG_BROADCAST, + MAGIC_PRINT_WFID_32, + MAGIC_PRINT_WFID_64 + }; + + void + Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + const VectorMask &mask = w->get_pred(); + + int op = 0; + bool got_op = false; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val0 = src1.get<int>(w, lane, 0); + if (got_op) { + if (src_val0 != op) { + fatal("Multiple magic instructions per PC not " + "supported\n"); + } + } else { + op = src_val0; + got_op = true; + } + } + } + + switch(op) { + case MAGIC_PRINT_WF_32: + MagicPrintWF32(w); + break; + case MAGIC_PRINT_WF_64: + MagicPrintWF64(w); + break; + case MAGIC_PRINT_LANE: + MagicPrintLane(w); + break; + case MAGIC_PRINT_LANE_64: + MagicPrintLane64(w); + break; + case MAGIC_PRINT_WF_FLOAT: + MagicPrintWFFloat(w); + break; + case MAGIC_SIM_BREAK: + MagicSimBreak(w); + break; + case MAGIC_PREF_SUM: + MagicPrefixSum(w); + break; + case MAGIC_REDUCTION: + MagicReduction(w); + break; + case MAGIC_MASKLANE_LOWER: + MagicMaskLower(w); + break; + case MAGIC_MASKLANE_UPPER: + MagicMaskUpper(w); + break; + case MAGIC_JOIN_WF_BAR: + MagicJoinWFBar(w); + break; + case MAGIC_WAIT_WF_BAR: + MagicWaitWFBar(w); + break; + case MAGIC_PANIC: + MagicPanic(w); + break; + + // atomic instructions + case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: + MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: + MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); + break; + + case MAGIC_LOAD_GLOBAL_U32_REG: + MagicLoadGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_XACT_CAS_LD: + MagicXactCasLd(w); + break; + + case MAGIC_MOST_SIG_THD: + MagicMostSigThread(w); + break; + + case MAGIC_MOST_SIG_BROADCAST: + MagicMostSigBroadcast(w); + break; + + case MAGIC_PRINT_WFID_32: + MagicPrintWF32ID(w); + break; + + case MAGIC_PRINT_WFID_64: + MagicPrintWFID64(w); + break; + + default: fatal("unrecognized magic instruction: %d\n", op); + } + } + + void + Call::MagicPrintLane(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintLane64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintWF32(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWF32ID(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWF64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWFID64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWFFloat(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + float src_val1 = src1.get<float>(w, lane, 1); + res_str += csprintf("%08f", src_val1); + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + // raises a signal that GDB will catch + // when done with the break, type "signal 0" in gdb to continue + void + Call::MagicSimBreak(Wavefront *w) + { + std::string res_str; + // print out state for this wavefront and then break + res_str = csprintf("Breakpoint encountered for wavefront %i\n", + w->wfSlotId); + + res_str += csprintf(" Kern ID: %i\n", w->kern_id); + res_str += csprintf(" Phase ID: %i\n", w->simdId); + res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); + res_str += csprintf(" Exec mask: "); + + for (int i = VSZ - 1; i >= 0; --i) { + if (w->execMask(i)) + res_str += "1"; + else + res_str += "0"; + + if ((i & 7) == 7) + res_str += " "; + } + + res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); + + res_str += "\nHelpful debugging hints:\n"; + res_str += " Check out w->s_reg / w->d_reg for register state\n"; + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + fflush(stdout); + + raise(SIGTRAP); + } + + void + Call::MagicPrefixSum(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + dest.set<int>(w, lane, res); + res += src_val1; + } + } + } + + void + Call::MagicReduction(Wavefront *w) + { + // reduction magic instruction + // The reduction instruction takes up to 64 inputs (one from + // each thread in a WF) and sums them. It returns the sum to + // each thread in the WF. + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + res += src_val1; + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskLower(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane < (VSZ/2)) { + res = res | ((uint32_t)(1) << lane); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskUpper(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane >= (VSZ/2)) { + res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicJoinWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]++; + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + } + + if (max_cnt > w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + } + + void + Call::MagicWaitWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]--; + } + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + + if (max_cnt < w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) + w->dropFetch = true; + } + + void + Call::MagicPanic(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + panic("OpenCL Code failed assertion #%d. Triggered by lane %s", + src_val1, lane); + } + } + } + + void + Call::calcAddr(Wavefront *w, GPUDynInstPtr m) + { + // the address is in src1 | src2 + for (int lane = 0; lane < VSZ; ++lane) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); + + m->addr[lane] = addr; + } + + } + + void + Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + // calculate the address + calcAddr(w, m); + + m->m_op = Enums::MO_LD; + m->m_type = U32::memType; //MemDataType::memType; + m->v_type = U32::vgprType; //DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + // FIXME + //m->dst_reg = this->dest.regIndex(); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicXactCasLd(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int src_val1 = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + src_val1 = src1.get<int>(w, lane, 1); + break; + } + } + + if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { + w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); + } + + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue + .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); + } + + void + Call::MagicMostSigThread(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + unsigned mst = true; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + dest.set<int>(w, lane, mst); + mst = false; + } + } + } + + void + Call::MagicMostSigBroadcast(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + bool got_res = false; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + if (!got_res) { + res = src1.get<int>(w, lane, 1); + got_res = true; + } + dest.set<int>(w, lane, res); + } + } + } + +} // namespace HsailISA |