summaryrefslogtreecommitdiff
path: root/src/gpu-compute/compute_unit.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/gpu-compute/compute_unit.cc')
-rw-r--r--src/gpu-compute/compute_unit.cc117
1 files changed, 67 insertions, 50 deletions
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b3a99b182..5ec061172 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
*
* Author: John Kalamatianos, Anthony Gutierrez
*/
-
#include "gpu-compute/compute_unit.hh"
+#include <limits>
+
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
{
- // this check will be eliminated once we have wavefront size support added
- fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+ /**
+ * This check is necessary because std::bitset only provides conversion
+ * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+ * there are * a few places in the code where to_ullong() is used, however
+ * if VSZ is larger than a value the host can support then bitset will
+ * throw a runtime exception. we should remove all use of to_long() or
+ * to_ullong() so we can have VSZ greater than 64b, however until that is
+ * done this assert is required.
+ */
+ fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+ p->wfSize <= 0,
+ "WF size is larger than the host can support");
+ fatal_if(!isPowerOf2(wavefrontSize),
+ "Wavefront size should be a power of 2");
// calculate how many cycles a vector load or store will need to transfer
// its data over the corresponding buses
- numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
- / (double)vrfToCoalescerBusWidth);
+ numCyclesPerStoreTransfer =
+ (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+ (double)vrfToCoalescerBusWidth);
- numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
- lastVaddrWF[j][i].resize(VSZ);
+ lastVaddrWF[j][i].resize(wfSize());
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
wfList[j][i]->setParent(this);
- for (int k = 0; k < VSZ; ++k) {
+ for (int k = 0; k < wfSize(); ++k) {
lastVaddrWF[j][i][k] = 0;
}
}
}
- lastVaddrPhase.resize(numSIMDs);
+ lastVaddrSimd.resize(numSIMDs);
for (int i = 0; i < numSIMDs; ++i) {
- lastVaddrPhase[i] = LastVaddrWave();
+ lastVaddrSimd[i].resize(wfSize(), 0);
}
- lastVaddrCU = LastVaddrWave();
+ lastVaddrCU.resize(wfSize());
lds.setParent(this);
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
fatal("Invalid WF execution policy (CU)\n");
}
- memPort.resize(VSZ);
+ memPort.resize(wfSize());
// resize the tlbPort vectorArray
- int tlbPort_width = perLaneTLB ? VSZ : 1;
+ int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
ComputeUnit::~ComputeUnit()
{
// Delete wavefront slots
-
- for (int j = 0; j < numSIMDs; ++j)
+ for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
delete wfList[j][i];
}
-
+ lastVaddrSimd[j].clear();
+ }
+ lastVaddrCU.clear();
readyList.clear();
waveStatusList.clear();
dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
VectorMask init_mask;
init_mask.reset();
- for (int k = 0; k < VSZ; ++k) {
- if (k + cnt * VSZ < trueWgSizeTotal)
+ for (int k = 0; k < wfSize(); ++k) {
+ if (k + cnt * wfSize() < trueWgSizeTotal)
init_mask[k] = 1;
}
wfCtx->init_mask = init_mask.to_ullong();
wfCtx->exec_mask = init_mask.to_ullong();
- for (int i = 0; i < VSZ; ++i) {
- wfCtx->bar_cnt[i] = 0;
- }
+ wfCtx->bar_cnt.resize(wfSize(), 0);
wfCtx->max_bar_cnt = 0;
wfCtx->old_barrier_cnt = 0;
wfCtx->barrier_cnt = 0;
wfCtx->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
wfCtx->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
wfCtx->pc = 0;
wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->dynwaveid = cnt;
w->init_mask = wfCtx->init_mask;
- for (int k = 0; k < VSZ; ++k) {
- w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
- w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
- w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+ for (int k = 0; k < wfSize(); ++k) {
+ w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+ w->workitemid[1][k] =
+ ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] =
+ (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
w->barrier_cnt = wfCtx->barrier_cnt;
- w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+ w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < wfSize(); ++i) {
w->bar_cnt[i] = wfCtx->bar_cnt[i];
}
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
// is this the last wavefront in the workgroup
// if set the spillWidth to be the remaining work-items
// so that the vector access is correct
- if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
- w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
} else {
- w->spillWidth = VSZ;
+ w->spillWidth = wfSize();
}
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
w->start(++_n_wave, ndr->q.code_ptr);
+ wfCtx->bar_cnt.clear();
}
void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
// Send L1 cache acquire
// isKernel + isAcquire = Kernel Begin
if (shader->impl_kern_boundary_sync) {
- GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
nullptr,
nullptr, 0);
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
if (w->status == Wavefront::S_STOPPED) {
// if we have scheduled all work items then stop
// scheduling wavefronts
- if (cnt * VSZ >= trueWgSizeTotal)
+ if (cnt * wfSize() >= trueWgSizeTotal)
break;
// reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
// work item of the work group
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
bool vregAvail = true;
- int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+ int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
int freeWfSlots = 0;
// check if the total number of VGPRs required by all WFs of the WG
// fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
// Setup space for call args
for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
- wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
}
}
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
Addr last = 0;
switch(computeUnit->prefetchType) {
- case Enums::PF_CU:
+ case Enums::PF_CU:
last = computeUnit->lastVaddrCU[mp_index];
break;
- case Enums::PF_PHASE:
- last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrSimd[simdId][mp_index];
break;
- case Enums::PF_WF:
+ case Enums::PF_WF:
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
- default:
+ default:
break;
}
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
computeUnit->lastVaddrCU[mp_index] = vaddr;
- computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
;
ldsBankConflictDist
- .init(0, VSZ, 2)
+ .init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
;
pageDivergenceDist
- // A wavefront can touch 1 to VSZ pages per memory instruction.
- // The number of pages per bin can be configured (here it's 4).
- .init(1, VSZ, 4)
+ // A wavefront can touch up to N pages per memory instruction where
+ // N is equal to the wavefront size
+ // The number of pages per bin can be configured (here it's 4).
+ .init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. VSZ/inst)")
+ .desc("number of vec ops executed (e.g. WF size/inst)")
;
totalCycles