-rw-r--r--  configs/example/apu_se.py                  |   3
-rwxr-xr-x  src/arch/hsail/gen.py                      |  14
-rw-r--r--  src/arch/hsail/insts/branch.hh             |   2
-rw-r--r--  src/arch/hsail/insts/main.cc               |   2
-rw-r--r--  src/arch/hsail/insts/mem.hh                |  16
-rw-r--r--  src/arch/hsail/insts/mem_impl.hh           |  46
-rw-r--r--  src/arch/hsail/insts/pseudo_inst.cc        |  57
-rw-r--r--  src/arch/hsail/operand.hh                  |  44
-rw-r--r--  src/gpu-compute/GPU.py                     |   2
-rw-r--r--  src/gpu-compute/cl_driver.cc               |   2
-rw-r--r--  src/gpu-compute/compute_unit.cc            | 117
-rw-r--r--  src/gpu-compute/compute_unit.hh            |  18
-rw-r--r--  src/gpu-compute/dispatcher.cc              |   6
-rw-r--r--  src/gpu-compute/dispatcher.hh              |   1
-rw-r--r--  src/gpu-compute/global_memory_pipeline.cc  |   4
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.cc            |  22
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.hh            |  10
-rw-r--r--  src/gpu-compute/local_memory_pipeline.cc   |   4
-rw-r--r--  src/gpu-compute/misc.hh                    |  18
-rw-r--r--  src/gpu-compute/qstruct.hh                 |   2
-rw-r--r--  src/gpu-compute/vector_register_file.cc    |   2
-rw-r--r--  src/gpu-compute/vector_register_state.cc   |  15
-rw-r--r--  src/gpu-compute/vector_register_state.hh   |   6
-rw-r--r--  src/gpu-compute/wavefront.cc               |  10
-rw-r--r--  src/gpu-compute/wavefront.hh               |  26
25 files changed, 256 insertions(+), 193 deletions(-)
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 75819b505..27a26071b 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -250,7 +250,8 @@ for i in xrange(n_cu):
vrfs = []
for j in xrange(options.simds_per_cu):
for k in xrange(shader.n_wf):
- wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+ wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
+ wfSize = options.wf_size))
vrfs.append(VectorRegisterFile(simd_id=j,
num_regs_per_simd=options.vreg_file_size))
compute_units[-1].wavefronts = wavefronts
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py
index bb369fd10..f77680541 100755
--- a/src/arch/hsail/gen.py
+++ b/src/arch/hsail/gen.py
@@ -235,7 +235,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestCType dest_val = $expr;
this->dest.set(w, lane, dest_val);
@@ -256,7 +256,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
DestCType dest_val = $expr;
@@ -277,7 +277,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
if ($dest_is_src_flag) {
@@ -312,7 +312,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
@@ -346,7 +346,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestT dest_val;
if ($dest_is_src_flag) {
@@ -372,7 +372,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
@@ -401,7 +401,7 @@ $class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestCType dest_val;
SrcCType src_val[$num_srcs];
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
index f4b00fc8d..45cd876ad 100644
--- a/src/arch/hsail/insts/branch.hh
+++ b/src/arch/hsail/insts/branch.hh
@@ -279,7 +279,7 @@ namespace HsailISA
// taken branch
const uint32_t true_pc = getTargetPc();
VectorMask true_mask;
- for (unsigned int lane = 0; lane < VSZ; ++lane) {
+ for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
}
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
index 4e70bf46a..004054524 100644
--- a/src/arch/hsail/insts/main.cc
+++ b/src/arch/hsail/insts/main.cc
@@ -134,7 +134,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
// mask off completed work-items
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->init_mask[lane] = 0;
}
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
index f2792cd49..1db98d212 100644
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@@ -457,7 +457,7 @@ namespace HsailISA
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_dest_operands > 1) {
- for (int i = 0; i < VSZ; ++i)
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_dest_operands);
else
@@ -466,9 +466,10 @@ namespace HsailISA
for (int k = 0; k < num_dest_operands; ++k) {
- c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+ c0 *d = &((c0*)gpuDynInst->d_data)
+ [k * gpuDynInst->computeUnit()->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
@@ -1004,7 +1005,7 @@ namespace HsailISA
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_src_operands > 1) {
- for (int i = 0; i < VSZ; ++i)
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_src_operands);
else
@@ -1012,9 +1013,10 @@ namespace HsailISA
}
for (int k = 0; k < num_src_operands; ++k) {
- c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+ c0 *d = &((c0*)gpuDynInst->d_data)
+ [k * gpuDynInst->computeUnit()->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
@@ -1402,7 +1404,7 @@ namespace HsailISA
c0 *e = &((c0*) gpuDynInst->a_data)[0];
c0 *f = &((c0*) gpuDynInst->x_data)[0];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i];
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
index 94f0cd6aa..8329c6e8a 100644
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -60,14 +60,16 @@ namespace HsailISA
typedef typename DestDataType::CType CType M5_VAR_USED;
const VectorMask &mask = w->get_pred();
- uint64_t addr_vec[VSZ];
+ std::vector<Addr> addr_vec;
+ addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
this->addr.calcVector(w, addr_vec);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, addr_vec[lane]);
}
}
+ addr_vec.clear();
}
template<typename MemDataType, typename DestDataType,
@@ -121,8 +123,8 @@ namespace HsailISA
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
assert(se);
- return w->wfSlotId * w->privSizePerItem * VSZ +
- se->offset * VSZ +
+ return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
+ se->offset * w->computeUnit->wfSize() +
lane * se->size;
*/
@@ -139,9 +141,11 @@ namespace HsailISA
Addr addr_div8 = addr / 8;
Addr addr_mod8 = addr % 8;
- Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
+ Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
+ addr_mod8 + w->privBase;
- assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+ assert(ret < w->privBase +
+ (w->privSizePerItem * w->computeUnit->wfSize()));
return ret;
}
@@ -175,7 +179,7 @@ namespace HsailISA
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, val);
}
@@ -184,7 +188,7 @@ namespace HsailISA
return;
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
MemCType val = w->readCallArgMem<MemCType>(lane, address);
@@ -239,7 +243,7 @@ namespace HsailISA
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global access for private
// addresses (starting from 0). We need to add the private offset)
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (m->addr[lane] < w->privSizePerItem) {
if (mask[lane]) {
// what is the size of the object we are accessing?
@@ -267,7 +271,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
// note: this calculation will NOT WORK if the compiler
// ever generates loads/stores to the same address with
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
@@ -301,7 +305,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
m->addr[lane] += w->roBase;
@@ -318,7 +322,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
@@ -360,7 +364,7 @@ namespace HsailISA
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType data = this->src.template get<CType>(w, lane);
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
@@ -378,7 +382,7 @@ namespace HsailISA
this->addr.calcVector(w, m->addr);
if (num_src_operands == 1) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
((CType*)m->d_data)[lane] =
this->src.template get<CType>(w, lane);
@@ -386,9 +390,9 @@ namespace HsailISA
}
} else {
for (int k= 0; k < num_src_operands; ++k) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
- ((CType*)m->d_data)[k * VSZ + lane] =
+ ((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] =
this->src_vect[k].template get<CType>(w, lane);
}
}
@@ -428,7 +432,7 @@ namespace HsailISA
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global access for private
// addresses (starting from 0). We need to add the private offset)
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
if (m->addr[lane] < w->privSizePerItem) {
@@ -454,7 +458,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->spillSizePerItem);
@@ -483,7 +487,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
m->addr[lane] = m->addr[lane] + lane *
@@ -558,14 +562,14 @@ namespace HsailISA
this->addr.calcVector(w, m->addr);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((CType *)m->a_data)[lane] =
this->src[0].template get<CType>(w, lane);
}
// load second source operand for CAS
if (NumSrcOperands > 1) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((CType*)m->x_data)[lane] =
this->src[1].template get<CType>(w, lane);
}
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
index 9506a80ab..56ca8047c 100644
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -84,7 +84,7 @@ namespace HsailISA
int op = 0;
bool got_op = false;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val0 = src1.get<int>(w, lane, 0);
if (got_op) {
@@ -182,7 +182,7 @@ namespace HsailISA
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
@@ -205,7 +205,7 @@ namespace HsailISA
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
@@ -231,7 +231,7 @@ namespace HsailISA
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
@@ -270,7 +270,7 @@ namespace HsailISA
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
@@ -311,7 +311,7 @@ namespace HsailISA
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
@@ -350,7 +350,7 @@ namespace HsailISA
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
@@ -391,7 +391,7 @@ namespace HsailISA
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
@@ -430,7 +430,7 @@ namespace HsailISA
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
res_str += csprintf(" Exec mask: ");
- for (int i = VSZ - 1; i >= 0; --i) {
+ for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
if (w->execMask(i))
res_str += "1";
else
@@ -458,7 +458,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
dest.set<int>(w, lane, res);
@@ -477,14 +477,14 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
res += src_val1;
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
@@ -497,19 +497,19 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
- if (lane < (VSZ/2)) {
+ if (lane < (w->computeUnit->wfSize()/2)) {
res = res | ((uint32_t)(1) << lane);
}
}
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
@@ -521,19 +521,20 @@ namespace HsailISA
{
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
- if (lane >= (VSZ/2)) {
- res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+ if (lane >= (w->computeUnit->wfSize()/2)) {
+ res = res | ((uint32_t)(1) <<
+ (lane - (w->computeUnit->wfSize()/2)));
}
}
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
@@ -546,7 +547,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]++;
@@ -567,7 +568,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]--;
}
@@ -592,7 +593,7 @@ namespace HsailISA
{
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
@@ -605,7 +606,7 @@ namespace HsailISA
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
{
// the address is in src1 | src2
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
@@ -622,7 +623,7 @@ namespace HsailISA
calcAddr(w, m);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
}
@@ -661,7 +662,7 @@ namespace HsailISA
GPUDynInstPtr m = gpuDynInst;
calcAddr(w, m);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
}
@@ -736,7 +737,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
int src_val1 = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
src_val1 = src1.get<int>(w, lane, 1);
break;
@@ -758,7 +759,7 @@ namespace HsailISA
const VectorMask &mask = w->get_pred();
unsigned mst = true;
- for (int lane = VSZ - 1; lane >= 0; --lane) {
+ for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
if (mask[lane]) {
dest.set<int>(w, lane, mst);
mst = false;
@@ -773,7 +774,7 @@ namespace HsailISA
int res = 0;
bool got_res = false;
- for (int lane = VSZ - 1; lane >= 0; --lane) {
+ for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
if (mask[lane]) {
if (!got_res) {
res = src1.get<int>(w, lane, 1);
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh
index e3d275b10..4d981ee00 100644
--- a/src/arch/hsail/operand.hh
+++ b/src/arch/hsail/operand.hh
@@ -42,6 +42,7 @@
* Defines classes encapsulating HSAIL instruction operands.
*/
+#include <limits>
#include <string>
#include "arch/hsail/Brig.h"
@@ -346,6 +347,8 @@ class CRegOperand : public BaseRegOperand
template<typename T>
class ImmOperand : public BaseOperand
{
+ private:
+ uint16_t kind;
public:
T bits;
@@ -355,11 +358,21 @@ class ImmOperand : public BaseOperand
template<typename OperandType>
OperandType
- get()
+ get(Wavefront *w)
{
assert(sizeof(OperandType) <= sizeof(T));
+ panic_if(w == nullptr, "WF pointer needs to be set");
+
+ switch (kind) {
+ // immediate operand is WF size
+ case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+ return (OperandType)w->computeUnit->wfSize();
+ break;
- return *(OperandType*)&bits;
+ default:
+ return *(OperandType*)&bits;
+ break;
+ }
}
// This version of get() takes a WF* and a lane id for
@@ -368,7 +381,7 @@ class ImmOperand : public BaseOperand
OperandType
get(Wavefront *w, int lane)
{
- return get<OperandType>();
+ return get<OperandType>(w);
}
};
@@ -388,16 +401,18 @@ ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
-
+ kind = brigOp->kind;
return true;
}
break;
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
- bits = VSZ;
+ kind = brigOp->kind;
+ bits = std::numeric_limits<unsigned long long>::digits;
return true;
default:
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
}
@@ -409,6 +424,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
@@ -423,6 +439,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
@@ -456,7 +473,7 @@ class RegOrImmOperand : public BaseOperand
OperandType
get(Wavefront *w, int lane)
{
- return is_imm ? imm_op.template get<OperandType>() :
+ return is_imm ? imm_op.template get<OperandType>(w) :
reg_op.template get<OperandType>(w, lane);
}
@@ -571,7 +588,7 @@ class AddrOperandBase : public BaseOperand
uint64_t calcUniformBase();
public:
- virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+ virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0;
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
uint64_t offset;
@@ -586,7 +603,7 @@ class RegAddrOperand : public AddrOperandBase
RegOperandType reg;
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
- void calcVector(Wavefront *w, uint64_t *addrVec);
+ void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
uint32_t opSize() { return reg.opSize(); }
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
@@ -641,11 +658,12 @@ RegAddrOperand<RegOperandType>::calcUniform()
template<typename RegOperandType>
void
-RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w,
+ std::vector<Addr> &addrVec)
{
Addr address = calcUniformBase();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (w->execMask(lane)) {
if (reg.regFileChar == 's') {
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
@@ -680,7 +698,7 @@ class NoRegAddrOperand : public AddrOperandBase
public:
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
- void calcVector(Wavefront *w, uint64_t *addrVec);
+ void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
std::string disassemble();
};
@@ -698,11 +716,11 @@ NoRegAddrOperand::calcLane(Wavefront *w, int lane)
}
inline void
-NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec)
{
uint64_t address = calcUniformBase();
- for (int lane = 0; lane < VSZ; ++lane)
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane)
addrVec[lane] = address;
}
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index bd95f6335..f580a09f7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class Wavefront(SimObject):
@@ -68,6 +69,7 @@ class Wavefront(SimObject):
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
class ComputeUnit(MemObject):
type = 'ComputeUnit'
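For orientation, a minimal config sketch of how these new Params are meant to be driven from a run script; the --wf-size option name and the wfSize argument to VectorRegisterFile are assumptions for illustration (the patch only shows options.wf_size and a defaulted Param), not part of this change:

    # Hypothetical Python 2 config excerpt mirroring the apu_se.py pattern above.
    # Any name not shown in this patch (e.g., --wf-size) is assumed.
    parser.add_option("--wf-size", type="int", default=64, dest="wf_size",
                      help="wavefront size in work items (power of 2, <= 64)")

    wavefronts = []
    vrfs = []
    for j in xrange(options.simds_per_cu):
        for k in xrange(shader.n_wf):
            # wfSize now reaches the Wavefront SimObject as a runtime Param
            wavefronts.append(Wavefront(simdId=j, wf_slot_id=k,
                                        wfSize=options.wf_size))
        # the VRF's wfSize Param defaults to 64 if it is not set explicitly
        vrfs.append(VectorRegisterFile(simd_id=j,
                                       num_regs_per_simd=options.vreg_file_size,
                                       wfSize=options.wf_size))
    compute_units[-1].wavefronts = wavefronts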
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
index 3b3291c03..6bb6be102 100644
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
case HSA_GET_VSZ:
{
BufferArg buf(buf_addr, sizeof(uint32_t));
- *((uint32_t*)buf.bufferPtr()) = VSZ;
+ *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
buf.copyOut(tc->getMemProxy());
}
break;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b3a99b182..5ec061172 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
*
* Author: John Kalamatianos, Anthony Gutierrez
*/
-
#include "gpu-compute/compute_unit.hh"
+#include <limits>
+
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
{
- // this check will be eliminated once we have wavefront size support added
- fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+ /**
+ * This check is necessary because std::bitset only provides conversion
+ * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+ * There are a few places in the code where to_ullong() is used; however,
+ * if the wavefront size is larger than a value the host can support,
+ * bitset will throw a runtime exception. We should remove all use of
+ * to_ulong() and to_ullong() so the wavefront size can exceed 64 bits,
+ * but until that is done this check is required.
+ */
+ fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+ p->wfSize <= 0,
+ "WF size is larger than the host can support");
+ fatal_if(!isPowerOf2(wavefrontSize),
+ "Wavefront size should be a power of 2");
// calculate how many cycles a vector load or store will need to transfer
// its data over the corresponding buses
- numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
- / (double)vrfToCoalescerBusWidth);
+ numCyclesPerStoreTransfer =
+ (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+ (double)vrfToCoalescerBusWidth);
- numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
- lastVaddrWF[j][i].resize(VSZ);
+ lastVaddrWF[j][i].resize(wfSize());
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
wfList[j][i]->setParent(this);
- for (int k = 0; k < VSZ; ++k) {
+ for (int k = 0; k < wfSize(); ++k) {
lastVaddrWF[j][i][k] = 0;
}
}
}
- lastVaddrPhase.resize(numSIMDs);
+ lastVaddrSimd.resize(numSIMDs);
for (int i = 0; i < numSIMDs; ++i) {
- lastVaddrPhase[i] = LastVaddrWave();
+ lastVaddrSimd[i].resize(wfSize(), 0);
}
- lastVaddrCU = LastVaddrWave();
+ lastVaddrCU.resize(wfSize());
lds.setParent(this);
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
fatal("Invalid WF execution policy (CU)\n");
}
- memPort.resize(VSZ);
+ memPort.resize(wfSize());
// resize the tlbPort vectorArray
- int tlbPort_width = perLaneTLB ? VSZ : 1;
+ int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
ComputeUnit::~ComputeUnit()
{
// Delete wavefront slots
-
- for (int j = 0; j < numSIMDs; ++j)
+ for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
delete wfList[j][i];
}
-
+ lastVaddrSimd[j].clear();
+ }
+ lastVaddrCU.clear();
readyList.clear();
waveStatusList.clear();
dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
VectorMask init_mask;
init_mask.reset();
- for (int k = 0; k < VSZ; ++k) {
- if (k + cnt * VSZ < trueWgSizeTotal)
+ for (int k = 0; k < wfSize(); ++k) {
+ if (k + cnt * wfSize() < trueWgSizeTotal)
init_mask[k] = 1;
}
wfCtx->init_mask = init_mask.to_ullong();
wfCtx->exec_mask = init_mask.to_ullong();
- for (int i = 0; i < VSZ; ++i) {
- wfCtx->bar_cnt[i] = 0;
- }
+ wfCtx->bar_cnt.resize(wfSize(), 0);
wfCtx->max_bar_cnt = 0;
wfCtx->old_barrier_cnt = 0;
wfCtx->barrier_cnt = 0;
wfCtx->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
wfCtx->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
wfCtx->pc = 0;
wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->dynwaveid = cnt;
w->init_mask = wfCtx->init_mask;
- for (int k = 0; k < VSZ; ++k) {
- w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
- w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
- w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+ for (int k = 0; k < wfSize(); ++k) {
+ w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+ w->workitemid[1][k] =
+ ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] =
+ (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
w->barrier_cnt = wfCtx->barrier_cnt;
- w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+ w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < wfSize(); ++i) {
w->bar_cnt[i] = wfCtx->bar_cnt[i];
}
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
// is this the last wavefront in the workgroup
// if set the spillWidth to be the remaining work-items
// so that the vector access is correct
- if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
- w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
} else {
- w->spillWidth = VSZ;
+ w->spillWidth = wfSize();
}
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
w->start(++_n_wave, ndr->q.code_ptr);
+ wfCtx->bar_cnt.clear();
}
void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
// Send L1 cache acquire
// isKernel + isAcquire = Kernel Begin
if (shader->impl_kern_boundary_sync) {
- GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
nullptr,
nullptr, 0);
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
if (w->status == Wavefront::S_STOPPED) {
// if we have scheduled all work items then stop
// scheduling wavefronts
- if (cnt * VSZ >= trueWgSizeTotal)
+ if (cnt * wfSize() >= trueWgSizeTotal)
break;
// reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
// work item of the work group
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
bool vregAvail = true;
- int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+ int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
int freeWfSlots = 0;
// check if the total number of VGPRs required by all WFs of the WG
// fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
// Setup space for call args
for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
- wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
}
}
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
Addr last = 0;
switch(computeUnit->prefetchType) {
- case Enums::PF_CU:
+ case Enums::PF_CU:
last = computeUnit->lastVaddrCU[mp_index];
break;
- case Enums::PF_PHASE:
- last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrSimd[simdId][mp_index];
break;
- case Enums::PF_WF:
+ case Enums::PF_WF:
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
- default:
+ default:
break;
}
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
computeUnit->lastVaddrCU[mp_index] = vaddr;
- computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
;
ldsBankConflictDist
- .init(0, VSZ, 2)
+ .init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
;
pageDivergenceDist
- // A wavefront can touch 1 to VSZ pages per memory instruction.
- // The number of pages per bin can be configured (here it's 4).
- .init(1, VSZ, 4)
+ // A wavefront can touch up to N pages per memory instruction where
+ // N is equal to the wavefront size
+ // The number of pages per bin can be configured (here it's 4).
+ .init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. VSZ/inst)")
+ .desc("number of vec ops executed (e.g. WF size/inst)")
;
totalCycles
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index f47c27a0a..a234cbeb5 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
- class LastVaddrWave
- {
- public:
- Addr vaddrs[VSZ];
- Addr& operator[](int idx) {
- return vaddrs[idx];
- }
-
- LastVaddrWave() {
- for (int i = 0; i < VSZ; ++i)
- vaddrs[i] = 0;
- }
- };
-
- LastVaddrWave lastVaddrCU;
- std::vector<LastVaddrWave> lastVaddrPhase;
+ std::vector<Addr> lastVaddrCU;
+ std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 95c0c56a2..d1d011c0d 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
return shader->cuList.size();
}
+int
+GpuDispatcher::wfSize() const
+{
+ return shader->cuList[0]->wfSize();
+}
+
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
index 76f932655..e984af494 100644
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
// helper functions to retrieve/set GPU attributes
int getNumCUs();
+ int wfSize() const;
void setFuncargsSize(int funcargs_size);
};
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index 355018666..a6a4d86db 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
int physVgpr = w->remap(dst, sizeof(c0), 1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 2f35a983c..1806e79e4 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *_staticInst, uint64_t instSeqNum)
- : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+ : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+ m_op(Enums::MO_UNDEF),
memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
{
- tlbHitLevel.assign(VSZ, -1);
+ tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+ d_data = new uint8_t[computeUnit()->wfSize() * 16];
+ a_data = new uint8_t[computeUnit()->wfSize() * 8];
+ x_data = new uint8_t[computeUnit()->wfSize() * 8];
+ for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+ a_data[i] = 0;
+ x_data[i] = 0;
+ }
+ for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+ d_data[i] = 0;
+ }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+ delete[] d_data;
+ delete[] a_data;
+ delete[] x_data;
}
void
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index e44d8f80d..46774d867 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
uint64_t instSeqNum);
-
+ ~GPUDynInst();
void execute();
int numSrcRegOperands();
int numDstRegOperands();
@@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
Enums::StorageClassType executedAs();
// The address of the memory operation
- Addr addr[VSZ];
+ std::vector<Addr> addr;
Addr pAddr;
// The data to get written
- uint8_t d_data[VSZ * 16];
+ uint8_t *d_data;
// Additional data (for atomics)
- uint8_t a_data[VSZ * 8];
+ uint8_t *a_data;
// Additional data (for atomics)
- uint8_t x_data[VSZ * 8];
+ uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 7f919c5f4..a970d8f9b 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
int physVgpr = w->remap(dst,sizeof(c0),1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
// write the value into the physical VGPR. This is a purely
// functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
index 4f8032832..5ade89789 100644
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -37,28 +37,14 @@
#define __MISC_HH__
#include <bitset>
+#include <limits>
#include <memory>
#include "base/misc.hh"
class GPUDynInst;
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
- "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
class WaitClass
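As a stand-alone illustration (not part of the patch) of why VectorMask stays pinned at std::numeric_limits<unsigned long long>::digits bits: bitset::to_ullong() throws std::overflow_error if any bit above that width is set, so the new fatal_if checks keep the wavefront size within a mask the host can convert. A minimal sketch; all names below are local to the example:

    // Minimal compilable sketch of the VectorMask width constraint.
    #include <bitset>
    #include <cassert>
    #include <limits>

    using VectorMask =
        std::bitset<std::numeric_limits<unsigned long long>::digits>;

    int main()
    {
        const int wfSize = 32;  // any power of 2 up to 64 passes the new checks
        assert(wfSize > 0 &&
               wfSize <= std::numeric_limits<unsigned long long>::digits);
        assert((wfSize & (wfSize - 1)) == 0);

        VectorMask mask;
        for (int lane = 0; lane < wfSize; ++lane)
            mask[lane] = 1;  // lanes at or above wfSize are never set

        // Safe: with at most 64 bits in the mask, to_ullong() cannot throw.
        unsigned long long bits = mask.to_ullong();
        return bits == 0xffffffffULL ? 0 : 1;
    }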
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 092303c00..7bca757b8 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@ struct WFContext
{
// 32 bit values
// barrier state
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
// id (which WF in the WG)
int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index 8b7dc0691..c43d765af 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);
- vgprState->init(numRegsPerSimd);
+ vgprState->init(numRegsPerSimd, p->wfSize);
}
void
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
index f231b0579..e177d3b64 100644
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
#include "gpu-compute/vector_register_state.hh"
+#include <limits>
+
#include "gpu-compute/compute_unit.hh"
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
}
void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
{
s_reg.resize(_size);
+ fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+ wf_size <= 0,
+ "WF size is larger than the host can support or is zero");
+ fatal_if((wf_size & (wf_size - 1)) != 0,
+ "Wavefront size should be a power of 2");
+ for (int i = 0; i < s_reg.size(); ++i) {
+ s_reg[i].resize(wf_size, 0);
+ }
d_reg.resize(_size);
+ for (int i = 0; i < d_reg.size(); ++i) {
+ d_reg[i].resize(wf_size, 0);
+ }
}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
index a233b9acc..97a0d8e25 100644
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@ class VecRegisterState
{
public:
VecRegisterState();
- void init(uint32_t _size);
+ void init(uint32_t _size, uint32_t wf_size);
const std::string& name() const { return _name; }
void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@ class VecRegisterState
ComputeUnit *computeUnit;
std::string _name;
// 32-bit Single Precision Vector Register State
- std::vector<std::array<uint32_t, VSZ>> s_reg;
+ std::vector<std::vector<uint32_t>> s_reg;
// 64-bit Double Precision Vector Register State
- std::vector<std::array<uint64_t, VSZ>> d_reg;
+ std::vector<std::vector<uint64_t>> d_reg;
};
#endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 7cdec53e5..a20330082 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
last_trace = 0;
simdId = p->simdId;
wfSlotId = p->wf_slot_id;
-
status = S_STOPPED;
reservedVectorRegs = 0;
startVgprIndex = 0;
@@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
mem_trace_busy = 0;
old_vgpr_tcnt = 0xffffffffffffffffll;
old_dgpr_tcnt = 0xffffffffffffffffll;
+ old_vgpr.resize(p->wfSize);
pendingFetch = false;
dropFetch = false;
condRegState = new ConditionRegisterState();
maxSpVgprs = 0;
maxDpVgprs = 0;
+ last_addr.resize(p->wfSize);
+ workitemFlatId.resize(p->wfSize);
+ old_dgpr.resize(p->wfSize);
+ bar_cnt.resize(p->wfSize);
+ for (int i = 0; i < 3; ++i) {
+ workitemid[i].resize(p->wfSize);
+ }
}
void
@@ -144,6 +151,7 @@ Wavefront::~Wavefront()
{
if (callArgMem)
delete callArgMem;
+ delete condRegState;
}
void
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 0abab8e83..5a5386a3d 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@ class CallArgMem
public:
// pointer to buffer for storing function arguments
uint8_t *mem;
+ int wfSize;
// size of function args
int funcArgsSizePerItem;
@@ -90,13 +91,13 @@ class CallArgMem
int
getLaneOffset(int lane, int addr)
{
- return addr * VSZ + sizeof(CType) * lane;
+ return addr * wfSize + sizeof(CType) * lane;
}
- CallArgMem(int func_args_size_per_item)
- : funcArgsSizePerItem(func_args_size_per_item)
+ CallArgMem(int func_args_size_per_item, int wf_size)
+ : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
{
- mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+ mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
}
~CallArgMem()
@@ -192,9 +193,9 @@ class Wavefront : public SimObject
bool isOldestInstALU();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
- uint64_t last_addr[VSZ];
- uint32_t workitemid[3][VSZ];
- uint32_t workitemFlatId[VSZ];
+ std::vector<Addr> last_addr;
+ std::vector<uint32_t> workitemid[3];
+ std::vector<uint32_t> workitemFlatId;
uint32_t workgroupid[3];
uint32_t workgroupsz[3];
uint32_t gridsz[3];
@@ -230,14 +231,14 @@ class Wavefront : public SimObject
uint32_t startVgprIndex;
// Old value of destination gpr (for trace)
- uint32_t old_vgpr[VSZ];
+ std::vector<uint32_t> old_vgpr;
// Id of destination gpr (for trace)
uint32_t old_vgpr_id;
// Tick count of last old_vgpr copy
uint64_t old_vgpr_tcnt;
// Old value of destination gpr (for trace)
- uint64_t old_dgpr[VSZ];
+ std::vector<uint64_t> old_dgpr;
// Id of destination gpr (for trace)
uint32_t old_dgpr_id;
// Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@ class Wavefront : public SimObject
VectorMask init_mask;
// number of barriers this WF has joined
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
int max_bar_cnt;
// Flag to stall a wave on barrier
bool stalledAtBarrier;
@@ -296,9 +297,9 @@ class Wavefront : public SimObject
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
- initCallArgMem(int func_args_size_per_item)
+ initCallArgMem(int func_args_size_per_item, int wf_size)
{
- callArgMem = new CallArgMem(func_args_size_per_item);
+ callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
template<typename CType>
@@ -327,7 +328,6 @@ class Wavefront : public SimObject
}
void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
void exec();
void updateResources();
int ready(itype_e type);