summary refs log tree commit diff
path: root/src/gpu-compute
diff options
context:
space:
mode:
Diffstat (limited to 'src/gpu-compute')
-rw-r--r-- src/gpu-compute/GPU.py 2
-rw-r--r-- src/gpu-compute/cl_driver.cc 2
-rw-r--r-- src/gpu-compute/compute_unit.cc 117
-rw-r--r-- src/gpu-compute/compute_unit.hh 18
-rw-r--r-- src/gpu-compute/dispatcher.cc 6
-rw-r--r-- src/gpu-compute/dispatcher.hh 1
-rw-r--r-- src/gpu-compute/global_memory_pipeline.cc 4
-rw-r--r-- src/gpu-compute/gpu_dyn_inst.cc 22
-rw-r--r-- src/gpu-compute/gpu_dyn_inst.hh 10
-rw-r--r-- src/gpu-compute/local_memory_pipeline.cc 4
-rw-r--r-- src/gpu-compute/misc.hh 18
-rw-r--r-- src/gpu-compute/qstruct.hh 2
-rw-r--r-- src/gpu-compute/vector_register_file.cc 2
-rw-r--r-- src/gpu-compute/vector_register_state.cc 15
-rw-r--r-- src/gpu-compute/vector_register_state.hh 6
-rw-r--r-- src/gpu-compute/wavefront.cc 10
-rw-r--r-- src/gpu-compute/wavefront.hh 26
17 files changed, 151 insertions, 114 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index bd95f6335..f580a09f7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class Wavefront(SimObject):
@@ -68,6 +69,7 @@ class Wavefront(SimObject):
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
class ComputeUnit(MemObject):
type = 'ComputeUnit'
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
index 3b3291c03..6bb6be102 100644
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
case HSA_GET_VSZ:
{
BufferArg buf(buf_addr, sizeof(uint32_t));
- *((uint32_t*)buf.bufferPtr()) = VSZ;
+ *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
buf.copyOut(tc->getMemProxy());
}
break;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b3a99b182..5ec061172 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
*
* Author: John Kalamatianos, Anthony Gutierrez
*/
-
#include "gpu-compute/compute_unit.hh"
+#include <limits>
+
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
{
- // this check will be eliminated once we have wavefront size support added
- fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+ /**
+ * This check is necessary because std::bitset only provides conversion
+ * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+ * There are a few places in the code where to_ullong() is used; however,
+ * if the wavefront size is larger than a value the host can support then
+ * bitset will throw a runtime exception. We should remove all use of
+ * to_ulong() or to_ullong() so we can have a wavefront size greater than
+ * 64 lanes; however, until that is done this check is required.
+ */
+ fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+ p->wfSize <= 0,
+ "WF size is larger than the host can support");
+ fatal_if(!isPowerOf2(wavefrontSize),
+ "Wavefront size should be a power of 2");
// calculate how many cycles a vector load or store will need to transfer
// its data over the corresponding buses
- numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
- / (double)vrfToCoalescerBusWidth);
+ numCyclesPerStoreTransfer =
+ (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+ (double)vrfToCoalescerBusWidth);
- numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
- lastVaddrWF[j][i].resize(VSZ);
+ lastVaddrWF[j][i].resize(wfSize());
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
wfList[j][i]->setParent(this);
- for (int k = 0; k < VSZ; ++k) {
+ for (int k = 0; k < wfSize(); ++k) {
lastVaddrWF[j][i][k] = 0;
}
}
}
- lastVaddrPhase.resize(numSIMDs);
+ lastVaddrSimd.resize(numSIMDs);
for (int i = 0; i < numSIMDs; ++i) {
- lastVaddrPhase[i] = LastVaddrWave();
+ lastVaddrSimd[i].resize(wfSize(), 0);
}
- lastVaddrCU = LastVaddrWave();
+ lastVaddrCU.resize(wfSize());
lds.setParent(this);
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
fatal("Invalid WF execution policy (CU)\n");
}
- memPort.resize(VSZ);
+ memPort.resize(wfSize());
// resize the tlbPort vectorArray
- int tlbPort_width = perLaneTLB ? VSZ : 1;
+ int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
ComputeUnit::~ComputeUnit()
{
// Delete wavefront slots
-
- for (int j = 0; j < numSIMDs; ++j)
+ for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
delete wfList[j][i];
}
-
+ lastVaddrSimd[j].clear();
+ }
+ lastVaddrCU.clear();
readyList.clear();
waveStatusList.clear();
dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
VectorMask init_mask;
init_mask.reset();
- for (int k = 0; k < VSZ; ++k) {
- if (k + cnt * VSZ < trueWgSizeTotal)
+ for (int k = 0; k < wfSize(); ++k) {
+ if (k + cnt * wfSize() < trueWgSizeTotal)
init_mask[k] = 1;
}
wfCtx->init_mask = init_mask.to_ullong();
wfCtx->exec_mask = init_mask.to_ullong();
- for (int i = 0; i < VSZ; ++i) {
- wfCtx->bar_cnt[i] = 0;
- }
+ wfCtx->bar_cnt.resize(wfSize(), 0);
wfCtx->max_bar_cnt = 0;
wfCtx->old_barrier_cnt = 0;
wfCtx->barrier_cnt = 0;
wfCtx->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
wfCtx->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
wfCtx->pc = 0;
wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->dynwaveid = cnt;
w->init_mask = wfCtx->init_mask;
- for (int k = 0; k < VSZ; ++k) {
- w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
- w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
- w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+ for (int k = 0; k < wfSize(); ++k) {
+ w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+ w->workitemid[1][k] =
+ ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] =
+ (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
w->barrier_cnt = wfCtx->barrier_cnt;
- w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+ w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < wfSize(); ++i) {
w->bar_cnt[i] = wfCtx->bar_cnt[i];
}
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
// is this the last wavefront in the workgroup
// if set the spillWidth to be the remaining work-items
// so that the vector access is correct
- if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
- w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
} else {
- w->spillWidth = VSZ;
+ w->spillWidth = wfSize();
}
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
w->start(++_n_wave, ndr->q.code_ptr);
+ wfCtx->bar_cnt.clear();
}
void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
// Send L1 cache acquire
// isKernel + isAcquire = Kernel Begin
if (shader->impl_kern_boundary_sync) {
- GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
nullptr,
nullptr, 0);
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
if (w->status == Wavefront::S_STOPPED) {
// if we have scheduled all work items then stop
// scheduling wavefronts
- if (cnt * VSZ >= trueWgSizeTotal)
+ if (cnt * wfSize() >= trueWgSizeTotal)
break;
// reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
// work item of the work group
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
bool vregAvail = true;
- int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+ int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
int freeWfSlots = 0;
// check if the total number of VGPRs required by all WFs of the WG
// fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
// Setup space for call args
for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
- wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
}
}
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
Addr last = 0;
switch(computeUnit->prefetchType) {
- case Enums::PF_CU:
+ case Enums::PF_CU:
last = computeUnit->lastVaddrCU[mp_index];
break;
- case Enums::PF_PHASE:
- last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrSimd[simdId][mp_index];
break;
- case Enums::PF_WF:
+ case Enums::PF_WF:
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
- default:
+ default:
break;
}
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
computeUnit->lastVaddrCU[mp_index] = vaddr;
- computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
;
ldsBankConflictDist
- .init(0, VSZ, 2)
+ .init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
;
pageDivergenceDist
- // A wavefront can touch 1 to VSZ pages per memory instruction.
- // The number of pages per bin can be configured (here it's 4).
- .init(1, VSZ, 4)
+ // A wavefront can touch up to N pages per memory instruction where
+ // N is equal to the wavefront size.
+ // The number of pages per bin can be configured (here it's 4).
+ .init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. VSZ/inst)")
+ .desc("number of vec ops executed (e.g. WF size/inst)")
;
totalCycles
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index f47c27a0a..a234cbeb5 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
- class LastVaddrWave
- {
- public:
- Addr vaddrs[VSZ];
- Addr& operator[](int idx) {
- return vaddrs[idx];
- }
-
- LastVaddrWave() {
- for (int i = 0; i < VSZ; ++i)
- vaddrs[i] = 0;
- }
- };
-
- LastVaddrWave lastVaddrCU;
- std::vector<LastVaddrWave> lastVaddrPhase;
+ std::vector<Addr> lastVaddrCU;
+ std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 95c0c56a2..d1d011c0d 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
return shader->cuList.size();
}
+int
+GpuDispatcher::wfSize() const
+{
+ return shader->cuList[0]->wfSize();
+}
+
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
index 76f932655..e984af494 100644
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
// helper functions to retrieve/set GPU attributes
int getNumCUs();
+ int wfSize() const;
void setFuncargsSize(int funcargs_size);
};
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index 355018666..a6a4d86db 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
int physVgpr = w->remap(dst, sizeof(c0), 1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 2f35a983c..1806e79e4 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *_staticInst, uint64_t instSeqNum)
- : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+ : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+ m_op(Enums::MO_UNDEF),
memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
{
- tlbHitLevel.assign(VSZ, -1);
+ tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+ d_data = new uint8_t[computeUnit()->wfSize() * 16];
+ a_data = new uint8_t[computeUnit()->wfSize() * 8];
+ x_data = new uint8_t[computeUnit()->wfSize() * 8];
+ for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+ a_data[i] = 0;
+ x_data[i] = 0;
+ }
+ for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+ d_data[i] = 0;
+ }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+ delete[] d_data;
+ delete[] a_data;
+ delete[] x_data;
}
void
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index e44d8f80d..46774d867 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
uint64_t instSeqNum);
-
+ ~GPUDynInst();
void execute();
int numSrcRegOperands();
int numDstRegOperands();
@@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
Enums::StorageClassType executedAs();
// The address of the memory operation
- Addr addr[VSZ];
+ std::vector<Addr> addr;
Addr pAddr;
// The data to get written
- uint8_t d_data[VSZ * 16];
+ uint8_t *d_data;
// Additional data (for atomics)
- uint8_t a_data[VSZ * 8];
+ uint8_t *a_data;
// Additional data (for atomics)
- uint8_t x_data[VSZ * 8];
+ uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 7f919c5f4..a970d8f9b 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
int physVgpr = w->remap(dst,sizeof(c0),1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
// write the value into the physical VGPR. This is a purely
// functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
index 4f8032832..5ade89789 100644
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -37,28 +37,14 @@
#define __MISC_HH__
#include <bitset>
+#include <limits>
#include <memory>
#include "base/misc.hh"
class GPUDynInst;
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
- "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
class WaitClass
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 092303c00..7bca757b8 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@ struct WFContext
{
// 32 bit values
// barrier state
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
// id (which WF in the WG)
int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index 8b7dc0691..c43d765af 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);
- vgprState->init(numRegsPerSimd);
+ vgprState->init(numRegsPerSimd, p->wfSize);
}
void
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
index f231b0579..e177d3b64 100644
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
#include "gpu-compute/vector_register_state.hh"
+#include <limits>
+
#include "gpu-compute/compute_unit.hh"
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
}
void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
{
s_reg.resize(_size);
+ fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+ wf_size <= 0,
+ "WF size is larger than the host can support or is zero");
+ fatal_if((wf_size & (wf_size - 1)) != 0,
+ "Wavefront size should be a power of 2");
+ for (int i = 0; i < s_reg.size(); ++i) {
+ s_reg[i].resize(wf_size, 0);
+ }
d_reg.resize(_size);
+ for (int i = 0; i < d_reg.size(); ++i) {
+ d_reg[i].resize(wf_size, 0);
+ }
}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
index a233b9acc..97a0d8e25 100644
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@ class VecRegisterState
{
public:
VecRegisterState();
- void init(uint32_t _size);
+ void init(uint32_t _size, uint32_t wf_size);
const std::string& name() const { return _name; }
void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@ class VecRegisterState
ComputeUnit *computeUnit;
std::string _name;
// 32-bit Single Precision Vector Register State
- std::vector<std::array<uint32_t, VSZ>> s_reg;
+ std::vector<std::vector<uint32_t>> s_reg;
// 64-bit Double Precision Vector Register State
- std::vector<std::array<uint64_t, VSZ>> d_reg;
+ std::vector<std::vector<uint64_t>> d_reg;
};
#endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 7cdec53e5..a20330082 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
last_trace = 0;
simdId = p->simdId;
wfSlotId = p->wf_slot_id;
-
status = S_STOPPED;
reservedVectorRegs = 0;
startVgprIndex = 0;
@@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
mem_trace_busy = 0;
old_vgpr_tcnt = 0xffffffffffffffffll;
old_dgpr_tcnt = 0xffffffffffffffffll;
+ old_vgpr.resize(p->wfSize);
pendingFetch = false;
dropFetch = false;
condRegState = new ConditionRegisterState();
maxSpVgprs = 0;
maxDpVgprs = 0;
+ last_addr.resize(p->wfSize);
+ workitemFlatId.resize(p->wfSize);
+ old_dgpr.resize(p->wfSize);
+ bar_cnt.resize(p->wfSize);
+ for (int i = 0; i < 3; ++i) {
+ workitemid[i].resize(p->wfSize);
+ }
}
void
@@ -144,6 +151,7 @@ Wavefront::~Wavefront()
{
if (callArgMem)
delete callArgMem;
+ delete condRegState;
}
void
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 0abab8e83..5a5386a3d 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@ class CallArgMem
public:
// pointer to buffer for storing function arguments
uint8_t *mem;
+ int wfSize;
// size of function args
int funcArgsSizePerItem;
@@ -90,13 +91,13 @@ class CallArgMem
int
getLaneOffset(int lane, int addr)
{
- return addr * VSZ + sizeof(CType) * lane;
+ return addr * wfSize + sizeof(CType) * lane;
}
- CallArgMem(int func_args_size_per_item)
- : funcArgsSizePerItem(func_args_size_per_item)
+ CallArgMem(int func_args_size_per_item, int wf_size)
+ : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
{
- mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+ mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
}
~CallArgMem()
@@ -192,9 +193,9 @@ class Wavefront : public SimObject
bool isOldestInstALU();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
- uint64_t last_addr[VSZ];
- uint32_t workitemid[3][VSZ];
- uint32_t workitemFlatId[VSZ];
+ std::vector<Addr> last_addr;
+ std::vector<uint32_t> workitemid[3];
+ std::vector<uint32_t> workitemFlatId;
uint32_t workgroupid[3];
uint32_t workgroupsz[3];
uint32_t gridsz[3];
@@ -230,14 +231,14 @@ class Wavefront : public SimObject
uint32_t startVgprIndex;
// Old value of destination gpr (for trace)
- uint32_t old_vgpr[VSZ];
+ std::vector<uint32_t> old_vgpr;
// Id of destination gpr (for trace)
uint32_t old_vgpr_id;
// Tick count of last old_vgpr copy
uint64_t old_vgpr_tcnt;
// Old value of destination gpr (for trace)
- uint64_t old_dgpr[VSZ];
+ std::vector<uint64_t> old_dgpr;
// Id of destination gpr (for trace)
uint32_t old_dgpr_id;
// Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@ class Wavefront : public SimObject
VectorMask init_mask;
// number of barriers this WF has joined
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
int max_bar_cnt;
// Flag to stall a wave on barrier
bool stalledAtBarrier;
@@ -296,9 +297,9 @@ class Wavefront : public SimObject
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
- initCallArgMem(int func_args_size_per_item)
+ initCallArgMem(int func_args_size_per_item, int wf_size)
{
- callArgMem = new CallArgMem(func_args_size_per_item);
+ callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
template<typename CType>
@@ -327,7 +328,6 @@ class Wavefront : public SimObject
}
void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
void exec();
void updateResources();
int ready(itype_e type);