diff options
25 files changed, 256 insertions, 193 deletions
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 75819b505..27a26071b 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -250,7 +250,8 @@ for i in xrange(n_cu): vrfs = [] for j in xrange(options.simds_per_cu): for k in xrange(shader.n_wf): - wavefronts.append(Wavefront(simdId = j, wf_slot_id = k)) + wavefronts.append(Wavefront(simdId = j, wf_slot_id = k, + wfSize = options.wf_size)) vrfs.append(VectorRegisterFile(simd_id=j, num_regs_per_simd=options.vreg_file_size)) compute_units[-1].wavefronts = wavefronts diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py index bb369fd10..f77680541 100755 --- a/src/arch/hsail/gen.py +++ b/src/arch/hsail/gen.py @@ -235,7 +235,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestCType dest_val = $expr; this->dest.set(w, lane, dest_val); @@ -256,7 +256,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { SrcCType src_val0 = this->src0.get<SrcCType>(w, lane); DestCType dest_val = $expr; @@ -277,7 +277,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; if ($dest_is_src_flag) { @@ -312,7 +312,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; @@ -346,7 +346,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestT dest_val; if ($dest_is_src_flag) { @@ -372,7 +372,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) Wavefront *w = gpuDynInst->wavefront(); const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; @@ -401,7 +401,7 @@ $class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst) const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestCType dest_val; SrcCType src_val[$num_srcs]; diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh index f4b00fc8d..45cd876ad 100644 --- a/src/arch/hsail/insts/branch.hh +++ b/src/arch/hsail/insts/branch.hh @@ -279,7 +279,7 @@ namespace HsailISA // taken branch const uint32_t true_pc = getTargetPc(); VectorMask true_mask; - for (unsigned int lane = 0; lane < VSZ; ++lane) { + for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane]; } diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc index 4e70bf46a..004054524 100644 --- a/src/arch/hsail/insts/main.cc +++ b/src/arch/hsail/insts/main.cc @@ -134,7 +134,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); // mask off completed work-items - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->init_mask[lane] = 0; } diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh index f2792cd49..1db98d212 100644 --- a/src/arch/hsail/insts/mem.hh +++ b/src/arch/hsail/insts/mem.hh @@ -457,7 +457,7 @@ namespace HsailISA gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_dest_operands > 1) { - for (int i = 0; i < VSZ; ++i) + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_dest_operands); else @@ -466,9 +466,10 @@ namespace HsailISA for (int k = 0; k < num_dest_operands; ++k) { - c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + c0 *d = &((c0*)gpuDynInst->d_data) + [k * gpuDynInst->computeUnit()->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); @@ -1004,7 +1005,7 @@ namespace HsailISA gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_src_operands > 1) { - for (int i = 0; i < VSZ; ++i) + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_src_operands); else @@ -1012,9 +1013,10 @@ namespace HsailISA } for (int k = 0; k < num_src_operands; ++k) { - c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + c0 *d = &((c0*)gpuDynInst->d_data) + [k * gpuDynInst->computeUnit()->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); @@ -1402,7 +1404,7 @@ namespace HsailISA c0 *e = &((c0*) gpuDynInst->a_data)[0]; c0 *f = &((c0*) gpuDynInst->x_data)[0]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i]; diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh index 94f0cd6aa..8329c6e8a 100644 --- a/src/arch/hsail/insts/mem_impl.hh +++ b/src/arch/hsail/insts/mem_impl.hh @@ -60,14 +60,16 @@ namespace HsailISA typedef typename DestDataType::CType CType M5_VAR_USED; const VectorMask &mask = w->get_pred(); - uint64_t addr_vec[VSZ]; + std::vector<Addr> addr_vec; + addr_vec.resize(w->computeUnit->wfSize(), (Addr)0); this->addr.calcVector(w, addr_vec); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { this->dest.set(w, lane, addr_vec[lane]); } } + addr_vec.clear(); } template<typename MemDataType, typename DestDataType, @@ -121,8 +123,8 @@ namespace HsailISA i->parent->findSymbol(Brig::BrigPrivateSpace, addr); assert(se); - return w->wfSlotId * w->privSizePerItem * VSZ + - se->offset * VSZ + + return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() + + se->offset * w->computeUnit->wfSize() + lane * se->size; */ @@ -139,9 +141,11 @@ namespace HsailISA Addr addr_div8 = addr / 8; Addr addr_mod8 = addr % 8; - Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 + + addr_mod8 + w->privBase; - assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + assert(ret < w->privBase + + (w->privSizePerItem * w->computeUnit->wfSize())); return ret; } @@ -175,7 +179,7 @@ namespace HsailISA DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { this->dest.set(w, lane, val); } @@ -184,7 +188,7 @@ namespace HsailISA return; } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { uint64_t address = this->addr.calcUniform(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { MemCType val = w->readCallArgMem<MemCType>(lane, address); @@ -239,7 +243,7 @@ namespace HsailISA // this is a complete hack to get around a compiler bug // (the compiler currently generates global access for private // addresses (starting from 0). We need to add the private offset) - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (m->addr[lane] < w->privSizePerItem) { if (mask[lane]) { // what is the size of the object we are accessing? @@ -267,7 +271,7 @@ namespace HsailISA m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { // note: this calculation will NOT WORK if the compiler // ever generates loads/stores to the same address with // different widths (e.g., a ld_u32 addr and a ld_u16 addr) @@ -301,7 +305,7 @@ namespace HsailISA m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); m->addr[lane] += w->roBase; @@ -318,7 +322,7 @@ namespace HsailISA m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->privSizePerItem); @@ -360,7 +364,7 @@ namespace HsailISA if (this->segment == Brig::BRIG_SEGMENT_ARG) { uint64_t address = this->addr.calcUniform(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType data = this->src.template get<CType>(w, lane); DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); @@ -378,7 +382,7 @@ namespace HsailISA this->addr.calcVector(w, m->addr); if (num_src_operands == 1) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { ((CType*)m->d_data)[lane] = this->src.template get<CType>(w, lane); @@ -386,9 +390,9 @@ namespace HsailISA } } else { for (int k= 0; k < num_src_operands; ++k) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { - ((CType*)m->d_data)[k * VSZ + lane] = + ((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] = this->src_vect[k].template get<CType>(w, lane); } } @@ -428,7 +432,7 @@ namespace HsailISA // this is a complete hack to get around a compiler bug // (the compiler currently generates global access for private // addresses (starting from 0). We need to add the private offset) - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { if (m->addr[lane] < w->privSizePerItem) { @@ -454,7 +458,7 @@ namespace HsailISA m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->spillSizePerItem); @@ -483,7 +487,7 @@ namespace HsailISA m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->privSizePerItem); m->addr[lane] = m->addr[lane] + lane * @@ -558,14 +562,14 @@ namespace HsailISA this->addr.calcVector(w, m->addr); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((CType *)m->a_data)[lane] = this->src[0].template get<CType>(w, lane); } // load second source operand for CAS if (NumSrcOperands > 1) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((CType*)m->x_data)[lane] = this->src[1].template get<CType>(w, lane); } diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc index 9506a80ab..56ca8047c 100644 --- a/src/arch/hsail/insts/pseudo_inst.cc +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -84,7 +84,7 @@ namespace HsailISA int op = 0; bool got_op = false; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val0 = src1.get<int>(w, lane, 0); if (got_op) { @@ -182,7 +182,7 @@ namespace HsailISA { #if TRACING_ON const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); int src_val2 = src1.get<int>(w, lane, 2); @@ -205,7 +205,7 @@ namespace HsailISA { #if TRACING_ON const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int64_t src_val1 = src1.get<int64_t>(w, lane, 1); int src_val2 = src1.get<int>(w, lane, 2); @@ -231,7 +231,7 @@ namespace HsailISA std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -270,7 +270,7 @@ namespace HsailISA int src_val3 = -1; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -311,7 +311,7 @@ namespace HsailISA std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 3)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -350,7 +350,7 @@ namespace HsailISA int src_val3 = -1; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 3)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -391,7 +391,7 @@ namespace HsailISA std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -430,7 +430,7 @@ namespace HsailISA res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); res_str += csprintf(" Exec mask: "); - for (int i = VSZ - 1; i >= 0; --i) { + for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) { if (w->execMask(i)) res_str += "1"; else @@ -458,7 +458,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); dest.set<int>(w, lane, res); @@ -477,14 +477,14 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); res += src_val1; } } - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set<int>(w, lane, res); } @@ -497,19 +497,19 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); if (src_val1) { - if (lane < (VSZ/2)) { + if (lane < (w->computeUnit->wfSize()/2)) { res = res | ((uint32_t)(1) << lane); } } } } - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set<int>(w, lane, res); } @@ -521,19 +521,20 @@ namespace HsailISA { const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); if (src_val1) { - if (lane >= (VSZ/2)) { - res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + if (lane >= (w->computeUnit->wfSize()/2)) { + res = res | ((uint32_t)(1) << + (lane - (w->computeUnit->wfSize()/2))); } } } } - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set<int>(w, lane, res); } @@ -546,7 +547,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int max_cnt = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->bar_cnt[lane]++; @@ -567,7 +568,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int max_cnt = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->bar_cnt[lane]--; } @@ -592,7 +593,7 @@ namespace HsailISA { const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get<int>(w, lane, 1); panic("OpenCL Code failed assertion #%d. Triggered by lane %s", @@ -605,7 +606,7 @@ namespace HsailISA Call::calcAddr(Wavefront *w, GPUDynInstPtr m) { // the address is in src1 | src2 - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { int src_val1 = src1.get<int>(w, lane, 1); int src_val2 = src1.get<int>(w, lane, 2); Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); @@ -622,7 +623,7 @@ namespace HsailISA calcAddr(w, m); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); } @@ -661,7 +662,7 @@ namespace HsailISA GPUDynInstPtr m = gpuDynInst; calcAddr(w, m); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); } @@ -736,7 +737,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); int src_val1 = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { src_val1 = src1.get<int>(w, lane, 1); break; @@ -758,7 +759,7 @@ namespace HsailISA const VectorMask &mask = w->get_pred(); unsigned mst = true; - for (int lane = VSZ - 1; lane >= 0; --lane) { + for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { if (mask[lane]) { dest.set<int>(w, lane, mst); mst = false; @@ -773,7 +774,7 @@ namespace HsailISA int res = 0; bool got_res = false; - for (int lane = VSZ - 1; lane >= 0; --lane) { + for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { if (mask[lane]) { if (!got_res) { res = src1.get<int>(w, lane, 1); diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh index e3d275b10..4d981ee00 100644 --- a/src/arch/hsail/operand.hh +++ b/src/arch/hsail/operand.hh @@ -42,6 +42,7 @@ * Defines classes encapsulating HSAIL instruction operands. */ +#include <limits> #include <string> #include "arch/hsail/Brig.h" @@ -346,6 +347,8 @@ class CRegOperand : public BaseRegOperand template<typename T> class ImmOperand : public BaseOperand { + private: + uint16_t kind; public: T bits; @@ -355,11 +358,21 @@ class ImmOperand : public BaseOperand template<typename OperandType> OperandType - get() + get(Wavefront *w) { assert(sizeof(OperandType) <= sizeof(T)); + panic_if(w == nullptr, "WF pointer needs to be set"); + + switch (kind) { + // immediate operand is WF size + case Brig::BRIG_KIND_OPERAND_WAVESIZE: + return (OperandType)w->computeUnit->wfSize(); + break; - return *(OperandType*)&bits; + default: + return *(OperandType*)&bits; + break; + } } // This version of get() takes a WF* and a lane id for @@ -368,7 +381,7 @@ class ImmOperand : public BaseOperand OperandType get(Wavefront *w, int lane) { - return get<OperandType>(); + return get<OperandType>(w); } }; @@ -388,16 +401,18 @@ ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj) auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp; bits = *((T*)(obj->getData(cbptr->bytes + 4))); - + kind = brigOp->kind; return true; } break; case Brig::BRIG_KIND_OPERAND_WAVESIZE: - bits = VSZ; + kind = brigOp->kind; + bits = std::numeric_limits<unsigned long long>::digits; return true; default: + kind = Brig::BRIG_KIND_NONE; return false; } } @@ -409,6 +424,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at) const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + kind = Brig::BRIG_KIND_NONE; return false; } @@ -423,6 +439,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at) (const Brig::BrigOperand *)obj->getOperand(*data_offset); if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + kind = Brig::BRIG_KIND_NONE; return false; } @@ -456,7 +473,7 @@ class RegOrImmOperand : public BaseOperand OperandType get(Wavefront *w, int lane) { - return is_imm ? imm_op.template get<OperandType>() : + return is_imm ? imm_op.template get<OperandType>(w) : reg_op.template get<OperandType>(w, lane); } @@ -571,7 +588,7 @@ class AddrOperandBase : public BaseOperand uint64_t calcUniformBase(); public: - virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0; + virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0; virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0; uint64_t offset; @@ -586,7 +603,7 @@ class RegAddrOperand : public AddrOperandBase RegOperandType reg; void init(unsigned opOffset, const BrigObject *obj); uint64_t calcUniform(); - void calcVector(Wavefront *w, uint64_t *addrVec); + void calcVector(Wavefront *w, std::vector<Addr> &addrVec); uint64_t calcLane(Wavefront *w, int lane=0); uint32_t opSize() { return reg.opSize(); } bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; } @@ -641,11 +658,12 @@ RegAddrOperand<RegOperandType>::calcUniform() template<typename RegOperandType> void -RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec) +RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, + std::vector<Addr> &addrVec) { Addr address = calcUniformBase(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (w->execMask(lane)) { if (reg.regFileChar == 's') { addrVec[lane] = address + reg.template get<uint32_t>(w, lane); @@ -680,7 +698,7 @@ class NoRegAddrOperand : public AddrOperandBase public: void init(unsigned opOffset, const BrigObject *obj); uint64_t calcUniform(); - void calcVector(Wavefront *w, uint64_t *addrVec); + void calcVector(Wavefront *w, std::vector<Addr> &addrVec); uint64_t calcLane(Wavefront *w, int lane=0); std::string disassemble(); }; @@ -698,11 +716,11 @@ NoRegAddrOperand::calcLane(Wavefront *w, int lane) } inline void -NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec) { uint64_t address = calcUniformBase(); - for (int lane = 0; lane < VSZ; ++lane) + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) addrVec[lane] = address; } diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index bd95f6335..f580a09f7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject): simd_id = Param.Int(0, 'SIMD ID associated with this VRF') num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + wfSize = Param.Int(64, 'Wavefront size (in work items)') min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') class Wavefront(SimObject): @@ -68,6 +69,7 @@ class Wavefront(SimObject): simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + wfSize = Param.Int(64, 'Wavefront size (in work items)') class ComputeUnit(MemObject): type = 'ComputeUnit' diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc index 3b3291c03..6bb6be102 100644 --- a/src/gpu-compute/cl_driver.cc +++ b/src/gpu-compute/cl_driver.cc @@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) case HSA_GET_VSZ: { BufferArg buf(buf_addr, sizeof(uint32_t)); - *((uint32_t*)buf.bufferPtr()) = VSZ; + *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize(); buf.copyOut(tc->getMemProxy()); } break; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index b3a99b182..5ec061172 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -32,9 +32,10 @@ * * Author: John Kalamatianos, Anthony Gutierrez */ - #include "gpu-compute/compute_unit.hh" +#include <limits> + #include "base/output.hh" #include "debug/GPUDisp.hh" #include "debug/GPUExec.hh" @@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) { - // this check will be eliminated once we have wavefront size support added - fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + /** + * This check is necessary because std::bitset only provides conversion + * to unsigned long or unsigned long long via to_ulong() or to_ullong(). + * there are * a few places in the code where to_ullong() is used, however + * if VSZ is larger than a value the host can support then bitset will + * throw a runtime exception. we should remove all use of to_long() or + * to_ullong() so we can have VSZ greater than 64b, however until that is + * done this assert is required. + */ + fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits || + p->wfSize <= 0, + "WF size is larger than the host can support"); + fatal_if(!isPowerOf2(wavefrontSize), + "Wavefront size should be a power of 2"); // calculate how many cycles a vector load or store will need to transfer // its data over the corresponding buses - numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) - / (double)vrfToCoalescerBusWidth); + numCyclesPerStoreTransfer = + (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) / + (double)vrfToCoalescerBusWidth); - numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; lastVaddrWF.resize(numSIMDs); @@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { - lastVaddrWF[j][i].resize(VSZ); + lastVaddrWF[j][i].resize(wfSize()); wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); wfList[j][i]->setParent(this); - for (int k = 0; k < VSZ; ++k) { + for (int k = 0; k < wfSize(); ++k) { lastVaddrWF[j][i][k] = 0; } } } - lastVaddrPhase.resize(numSIMDs); + lastVaddrSimd.resize(numSIMDs); for (int i = 0; i < numSIMDs; ++i) { - lastVaddrPhase[i] = LastVaddrWave(); + lastVaddrSimd[i].resize(wfSize(), 0); } - lastVaddrCU = LastVaddrWave(); + lastVaddrCU.resize(wfSize()); lds.setParent(this); @@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), fatal("Invalid WF execution policy (CU)\n"); } - memPort.resize(VSZ); + memPort.resize(wfSize()); // resize the tlbPort vectorArray - int tlbPort_width = perLaneTLB ? VSZ : 1; + int tlbPort_width = perLaneTLB ? wfSize() : 1; tlbPort.resize(tlbPort_width); cuExitCallback = new CUExitCallback(this); @@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), ComputeUnit::~ComputeUnit() { // Delete wavefront slots - - for (int j = 0; j < numSIMDs; ++j) + for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } - + lastVaddrSimd[j].clear(); + } + lastVaddrCU.clear(); readyList.clear(); waveStatusList.clear(); dispatchList.clear(); @@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, VectorMask init_mask; init_mask.reset(); - for (int k = 0; k < VSZ; ++k) { - if (k + cnt * VSZ < trueWgSizeTotal) + for (int k = 0; k < wfSize(); ++k) { + if (k + cnt * wfSize() < trueWgSizeTotal) init_mask[k] = 1; } wfCtx->init_mask = init_mask.to_ullong(); wfCtx->exec_mask = init_mask.to_ullong(); - for (int i = 0; i < VSZ; ++i) { - wfCtx->bar_cnt[i] = 0; - } + wfCtx->bar_cnt.resize(wfSize(), 0); wfCtx->max_bar_cnt = 0; wfCtx->old_barrier_cnt = 0; wfCtx->barrier_cnt = 0; wfCtx->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); wfCtx->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); wfCtx->pc = 0; wfCtx->rpc = UINT32_MAX; @@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], w->dynwaveid = cnt; w->init_mask = wfCtx->init_mask; - for (int k = 0; k < VSZ; ++k) { - w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; - w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; - w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + for (int k = 0; k < wfSize(); ++k) { + w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; + w->workitemid[1][k] = + ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = + (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]); w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + @@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], w->old_barrier_cnt = wfCtx->old_barrier_cnt; w->barrier_cnt = wfCtx->barrier_cnt; - w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < wfSize(); ++i) { w->bar_cnt[i] = wfCtx->bar_cnt[i]; } @@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], // is this the last wavefront in the workgroup // if set the spillWidth to be the remaining work-items // so that the vector access is correct - if ((cnt + 1) * VSZ >= trueWgSizeTotal) { - w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + if ((cnt + 1) * wfSize() >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * wfSize()); } else { - w->spillWidth = VSZ; + w->spillWidth = wfSize(); } DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); w->start(++_n_wave, ndr->q.code_ptr); + wfCtx->bar_cnt.clear(); } void @@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr, + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this, nullptr, nullptr, 0); @@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) if (w->status == Wavefront::S_STOPPED) { // if we have scheduled all work items then stop // scheduling wavefronts - if (cnt * VSZ >= trueWgSizeTotal) + if (cnt * wfSize() >= trueWgSizeTotal) break; // reserve vector registers for the scheduled wavefront @@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr) // work item of the work group int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); bool vregAvail = true; - int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); int freeWfSlots = 0; // check if the total number of VGPRs required by all WFs of the WG // fit in the VRFs of all SIMD units @@ -623,7 +639,7 @@ ComputeUnit::init() // Setup space for call args for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { - wfList[j][i]->initCallArgMem(shader->funcargs_size); + wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize); } } @@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) Addr last = 0; switch(computeUnit->prefetchType) { - case Enums::PF_CU: + case Enums::PF_CU: last = computeUnit->lastVaddrCU[mp_index]; break; - case Enums::PF_PHASE: - last = computeUnit->lastVaddrPhase[simdId][mp_index]; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrSimd[simdId][mp_index]; break; - case Enums::PF_WF: + case Enums::PF_WF: last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; - default: + default: break; } @@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) DPRINTF(GPUPrefetch, "Stride is %d\n", stride); computeUnit->lastVaddrCU[mp_index] = vaddr; - computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr; computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? @@ -1488,7 +1504,7 @@ ComputeUnit::regStats() ; ldsBankConflictDist - .init(0, VSZ, 2) + .init(0, wfSize(), 2) .name(name() + ".lds_bank_conflicts") .desc("Number of bank conflicts per LDS memory packet") ; @@ -1499,27 +1515,28 @@ ComputeUnit::regStats() ; pageDivergenceDist - // A wavefront can touch 1 to VSZ pages per memory instruction. - // The number of pages per bin can be configured (here it's 4). - .init(1, VSZ, 4) + // A wavefront can touch up to N pages per memory instruction where + // N is equal to the wavefront size + // The number of pages per bin can be configured (here it's 4). + .init(1, wfSize(), 4) .name(name() + ".page_divergence_dist") .desc("pages touched per wf (over all mem. instr.)") ; controlFlowDivergenceDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".warp_execution_dist") .desc("number of lanes active per instruction (oval all instructions)") ; activeLanesPerGMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".gmem_lanes_execution_dist") .desc("number of active lanes per global memory instruction") ; activeLanesPerLMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".lmem_lanes_execution_dist") .desc("number of active lanes per local memory instruction") ; @@ -1531,7 +1548,7 @@ ComputeUnit::regStats() numVecOpsExecuted .name(name() + ".num_vec_ops_executed") - .desc("number of vec ops executed (e.g. VSZ/inst)") + .desc("number of vec ops executed (e.g. WF size/inst)") ; totalCycles diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index f47c27a0a..a234cbeb5 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -161,22 +161,8 @@ class ComputeUnit : public MemObject // if fixed-stride prefetching, this is the stride. int prefetchStride; - class LastVaddrWave - { - public: - Addr vaddrs[VSZ]; - Addr& operator[](int idx) { - return vaddrs[idx]; - } - - LastVaddrWave() { - for (int i = 0; i < VSZ; ++i) - vaddrs[i] = 0; - } - }; - - LastVaddrWave lastVaddrCU; - std::vector<LastVaddrWave> lastVaddrPhase; + std::vector<Addr> lastVaddrCU; + std::vector<std::vector<Addr>> lastVaddrSimd; std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; Enums::PrefetchType prefetchType; EXEC_POLICY exec_policy; diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 95c0c56a2..d1d011c0d 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs() return shader->cuList.size(); } +int +GpuDispatcher::wfSize() const +{ + return shader->cuList[0]->wfSize(); +} + void GpuDispatcher::setFuncargsSize(int funcargs_size) { diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh index 76f932655..e984af494 100644 --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice // helper functions to retrieve/set GPU attributes int getNumCUs(); + int wfSize() const; void setFuncargsSize(int funcargs_size); }; diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 355018666..a6a4d86db 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) int physVgpr = w->remap(dst, sizeof(c0), 1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " "$%s%d <- %d global ld done (src = wavefront " diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 2f35a983c..1806e79e4 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,11 +42,29 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum) - : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), + m_op(Enums::MO_UNDEF), memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) { - tlbHitLevel.assign(VSZ, -1); + tlbHitLevel.assign(computeUnit()->wfSize(), -1); + d_data = new uint8_t[computeUnit()->wfSize() * 16]; + a_data = new uint8_t[computeUnit()->wfSize() * 8]; + x_data = new uint8_t[computeUnit()->wfSize() * 8]; + for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { + a_data[i] = 0; + x_data[i] = 0; + } + for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) { + d_data[i] = 0; + } +} + +GPUDynInst::~GPUDynInst() +{ + delete[] d_data; + delete[] a_data; + delete[] x_data; } void diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index e44d8f80d..46774d867 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext public: GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum); - + ~GPUDynInst(); void execute(); int numSrcRegOperands(); int numDstRegOperands(); @@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext Enums::StorageClassType executedAs(); // The address of the memory operation - Addr addr[VSZ]; + std::vector<Addr> addr; Addr pAddr; // The data to get written - uint8_t d_data[VSZ * 16]; + uint8_t *d_data; // Additional data (for atomics) - uint8_t a_data[VSZ * 8]; + uint8_t *a_data; // Additional data (for atomics) - uint8_t x_data[VSZ * 8]; + uint8_t *x_data; // The execution mask VectorMask exec_mask; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 7f919c5f4..a970d8f9b 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) int physVgpr = w->remap(dst,sizeof(c0),1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { // write the value into the physical VGPR. This is a purely // functional operation. No timing is modeled. diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh index 4f8032832..5ade89789 100644 --- a/src/gpu-compute/misc.hh +++ b/src/gpu-compute/misc.hh @@ -37,28 +37,14 @@ #define __MISC_HH__ #include <bitset> +#include <limits> #include <memory> #include "base/misc.hh" class GPUDynInst; -// wavefront size of the machine -static const int VSZ = 64; - -/* - This check is necessary because std::bitset only provides conversion to - unsigned long or unsigned long long via to_ulong() or to_ullong(). there are - a few places in the code where to_ullong() is used, however if VSZ is larger - than a value the host can support then bitset will throw a runtime exception. - - we should remove all use of to_long() or to_ullong() so we can have VSZ - greater than 64b, however until that is done this assert is required. - */ -static_assert(VSZ <= sizeof(unsigned long long) * 8, - "VSZ is larger than the host can support"); - -typedef std::bitset<VSZ> VectorMask; +typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask; typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr; class WaitClass diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh index 092303c00..7bca757b8 100644 --- a/src/gpu-compute/qstruct.hh +++ b/src/gpu-compute/qstruct.hh @@ -100,7 +100,7 @@ struct WFContext { // 32 bit values // barrier state - int bar_cnt[VSZ]; + std::vector<int> bar_cnt; // id (which WF in the WG) int cnt; diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index 8b7dc0691..c43d765af 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) nxtBusy.clear(); nxtBusy.resize(numRegsPerSimd, 0); - vgprState->init(numRegsPerSimd); + vgprState->init(numRegsPerSimd, p->wfSize); } void diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc index f231b0579..e177d3b64 100644 --- a/src/gpu-compute/vector_register_state.cc +++ b/src/gpu-compute/vector_register_state.cc @@ -35,6 +35,8 @@ #include "gpu-compute/vector_register_state.hh" +#include <limits> + #include "gpu-compute/compute_unit.hh" VecRegisterState::VecRegisterState() : computeUnit(nullptr) @@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit) } void -VecRegisterState::init(uint32_t _size) +VecRegisterState::init(uint32_t _size, uint32_t wf_size) { s_reg.resize(_size); + fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits || + wf_size <= 0, + "WF size is larger than the host can support or is zero"); + fatal_if((wf_size & (wf_size - 1)) != 0, + "Wavefront size should be a power of 2"); + for (int i = 0; i < s_reg.size(); ++i) { + s_reg[i].resize(wf_size, 0); + } d_reg.resize(_size); + for (int i = 0; i < d_reg.size(); ++i) { + d_reg[i].resize(wf_size, 0); + } } diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh index a233b9acc..97a0d8e25 100644 --- a/src/gpu-compute/vector_register_state.hh +++ b/src/gpu-compute/vector_register_state.hh @@ -51,7 +51,7 @@ class VecRegisterState { public: VecRegisterState(); - void init(uint32_t _size); + void init(uint32_t _size, uint32_t wf_size); const std::string& name() const { return _name; } void setParent(ComputeUnit *_computeUnit); @@ -93,9 +93,9 @@ class VecRegisterState ComputeUnit *computeUnit; std::string _name; // 32-bit Single Precision Vector Register State - std::vector<std::array<uint32_t, VSZ>> s_reg; + std::vector<std::vector<uint32_t>> s_reg; // 64-bit Double Precision Vector Register State - std::vector<std::array<uint64_t, VSZ>> d_reg; + std::vector<std::vector<uint64_t>> d_reg; }; #endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 7cdec53e5..a20330082 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p) last_trace = 0; simdId = p->simdId; wfSlotId = p->wf_slot_id; - status = S_STOPPED; reservedVectorRegs = 0; startVgprIndex = 0; @@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p) mem_trace_busy = 0; old_vgpr_tcnt = 0xffffffffffffffffll; old_dgpr_tcnt = 0xffffffffffffffffll; + old_vgpr.resize(p->wfSize); pendingFetch = false; dropFetch = false; condRegState = new ConditionRegisterState(); maxSpVgprs = 0; maxDpVgprs = 0; + last_addr.resize(p->wfSize); + workitemFlatId.resize(p->wfSize); + old_dgpr.resize(p->wfSize); + bar_cnt.resize(p->wfSize); + for (int i = 0; i < 3; ++i) { + workitemid[i].resize(p->wfSize); + } } void @@ -144,6 +151,7 @@ Wavefront::~Wavefront() { if (callArgMem) delete callArgMem; + delete condRegState; } void diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 0abab8e83..5a5386a3d 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -83,6 +83,7 @@ class CallArgMem public: // pointer to buffer for storing function arguments uint8_t *mem; + int wfSize; // size of function args int funcArgsSizePerItem; @@ -90,13 +91,13 @@ class CallArgMem int getLaneOffset(int lane, int addr) { - return addr * VSZ + sizeof(CType) * lane; + return addr * wfSize + sizeof(CType) * lane; } - CallArgMem(int func_args_size_per_item) - : funcArgsSizePerItem(func_args_size_per_item) + CallArgMem(int func_args_size_per_item, int wf_size) + : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) { - mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); } ~CallArgMem() @@ -192,9 +193,9 @@ class Wavefront : public SimObject bool isOldestInstALU(); bool isOldestInstBarrier(); // used for passing spill address to DDInstGPU - uint64_t last_addr[VSZ]; - uint32_t workitemid[3][VSZ]; - uint32_t workitemFlatId[VSZ]; + std::vector<Addr> last_addr; + std::vector<uint32_t> workitemid[3]; + std::vector<uint32_t> workitemFlatId; uint32_t workgroupid[3]; uint32_t workgroupsz[3]; uint32_t gridsz[3]; @@ -230,14 +231,14 @@ class Wavefront : public SimObject uint32_t startVgprIndex; // Old value of destination gpr (for trace) - uint32_t old_vgpr[VSZ]; + std::vector<uint32_t> old_vgpr; // Id of destination gpr (for trace) uint32_t old_vgpr_id; // Tick count of last old_vgpr copy uint64_t old_vgpr_tcnt; // Old value of destination gpr (for trace) - uint64_t old_dgpr[VSZ]; + std::vector<uint64_t> old_dgpr; // Id of destination gpr (for trace) uint32_t old_dgpr_id; // Tick count of last old_vgpr copy @@ -247,7 +248,7 @@ class Wavefront : public SimObject VectorMask init_mask; // number of barriers this WF has joined - int bar_cnt[VSZ]; + std::vector<int> bar_cnt; int max_bar_cnt; // Flag to stall a wave on barrier bool stalledAtBarrier; @@ -296,9 +297,9 @@ class Wavefront : public SimObject // argument memory for hsail call instruction CallArgMem *callArgMem; void - initCallArgMem(int func_args_size_per_item) + initCallArgMem(int func_args_size_per_item, int wf_size) { - callArgMem = new CallArgMem(func_args_size_per_item); + callArgMem = new CallArgMem(func_args_size_per_item, wf_size); } template<typename CType> @@ -327,7 +328,6 @@ class Wavefront : public SimObject } void start(uint64_t _wfDynId, uint64_t _base_ptr); - void exec(); void updateResources(); int ready(itype_e type); |