summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/gpu-compute/compute_unit.cc 99
-rw-r--r-- src/gpu-compute/compute_unit.hh 8
-rw-r--r-- src/gpu-compute/qstruct.hh 53
3 files changed, 30 insertions, 130 deletions
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 5ec061172..83e2414db 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -193,50 +193,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
}
void
-ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
- int trueWgSize[], int trueWgSizeTotal,
- LdsChunk *ldsChunk, uint64_t origSpillMemStart)
-{
- wfCtx->cnt = cnt;
-
- VectorMask init_mask;
- init_mask.reset();
-
- for (int k = 0; k < wfSize(); ++k) {
- if (k + cnt * wfSize() < trueWgSizeTotal)
- init_mask[k] = 1;
- }
-
- wfCtx->init_mask = init_mask.to_ullong();
- wfCtx->exec_mask = init_mask.to_ullong();
-
- wfCtx->bar_cnt.resize(wfSize(), 0);
-
- wfCtx->max_bar_cnt = 0;
- wfCtx->old_barrier_cnt = 0;
- wfCtx->barrier_cnt = 0;
-
- wfCtx->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
-
- wfCtx->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
-
- wfCtx->pc = 0;
- wfCtx->rpc = UINT32_MAX;
-
- // set the wavefront context to have a pointer to this section of the LDS
- wfCtx->ldsChunk = ldsChunk;
-
- // WG state
- wfCtx->wg_id = ndr->globalWgId;
- wfCtx->barrier_id = barrier_id;
-
- // Kernel wide state
- wfCtx->ndr = ndr;
-}
-
-void
ComputeUnit::updateEvents() {
if (!timestampVec.empty()) {
@@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
void
-ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
- int trueWgSizeTotal)
+ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+ int cnt, LdsChunk *ldsChunk, NDRange *ndr)
{
static int _n_wave = 0;
- int cnt = wfCtx->cnt;
- NDRange *ndr = wfCtx->ndr;
// Fill in Kernel state
FillKernelState(w, ndr);
+ VectorMask init_mask;
+ init_mask.reset();
+
+ for (int k = 0; k < wfSize(); ++k) {
+ if (k + cnt * wfSize() < trueWgSizeTotal)
+ init_mask[k] = 1;
+ }
+
w->kern_id = ndr->dispatchId;
w->dynwaveid = cnt;
- w->init_mask = wfCtx->init_mask;
+ w->init_mask = init_mask.to_ullong();
for (int k = 0; k < wfSize(); ++k) {
w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
@@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
w->workitemid[0][k];
}
- w->old_barrier_cnt = wfCtx->old_barrier_cnt;
- w->barrier_cnt = wfCtx->barrier_cnt;
w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
- for (int i = 0; i < wfSize(); ++i) {
- w->bar_cnt[i] = wfCtx->bar_cnt[i];
- }
+ w->bar_cnt.resize(wfSize(), 0);
+
+ w->max_bar_cnt = 0;
+ w->old_barrier_cnt = 0;
+ w->barrier_cnt = 0;
+
+ w->privBase = ndr->q.privMemStart;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
- w->max_bar_cnt = wfCtx->max_bar_cnt;
- w->privBase = wfCtx->privBase;
- w->spillBase = wfCtx->spillBase;
+ w->spillBase = ndr->q.spillMemStart;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
- w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+ w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ullong());
// WG state
- w->wg_id = wfCtx->wg_id;
- w->dispatchid = wfCtx->ndr->dispatchId;
+ w->wg_id = ndr->globalWgId;
+ w->dispatchid = ndr->dispatchId;
w->workgroupid[0] = w->wg_id % ndr->numWg[0];
w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
- w->barrier_id = wfCtx->barrier_id;
+ w->barrier_id = barrier_id;
w->stalledAtBarrier = false;
- // move this from the context into the actual wavefront
- w->ldsChunk = wfCtx->ldsChunk;
+ // set the wavefront to have a pointer to this section of the LDS
+ w->ldsChunk = ldsChunk;
int32_t refCount M5_VAR_USED =
lds.increaseRefCounter(w->dispatchid, w->wg_id);
@@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
w->start(++_n_wave, ndr->q.code_ptr);
- wfCtx->bar_cnt.clear();
}
void
@@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
trueWgSizeTotal *= trueWgSize[d];
}
- uint64_t origSpillMemStart = ndr->q.spillMemStart;
// calculate the number of 32-bit vector registers required by wavefront
int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
int cnt = 0;
@@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
w->reservedVectorRegs = normSize;
vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
- WFContext wfCtx;
-
- InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
- ldsChunk, origSpillMemStart);
-
- StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+ StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
++cnt;
}
}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a234cbeb5..34b710cd6 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
void fetch(PacketPtr pkt, Wavefront *wavefront);
void FillKernelState(Wavefront *w, NDRange *ndr);
- void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
- int trueWgSizeTotal);
-
- void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
- int trueWgSize[], int trueWgSizeTotal,
- LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+ void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+ int cnt, LdsChunk *ldsChunk, NDRange *ndr);
void StartWorkgroup(NDRange *ndr);
int ReadyWorkgroup(NDRange *ndr);
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 7bca757b8..b400dc0ee 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -95,59 +95,6 @@ struct HsaQueueEntry
uint16_t num_args;
};
-// State used to start (or restart) a WF
-struct WFContext
-{
- // 32 bit values
- // barrier state
- std::vector<int> bar_cnt;
-
- // id (which WF in the WG)
- int cnt;
-
- // more barrier state
- int max_bar_cnt;
- int old_barrier_cnt;
- int barrier_cnt;
-
- // More Program Counter Stuff
- uint32_t pc;
-
- // Program counter of the immediate post-dominator instruction
- uint32_t rpc;
-
- // WG wide state (I don't see how to avoid redundancy here)
- int cu_id;
- uint32_t wg_id;
- uint32_t barrier_id;
-
- // 64 bit values (these values depend on the wavefront size)
- // masks
- uint64_t init_mask;
- uint64_t exec_mask;
-
- // private memory;
- Addr privBase;
- Addr spillBase;
-
- LdsChunk *ldsChunk;
-
- /*
- * Kernel wide state
- * This is a hack. This state should be moved through simulated memory
- * during a yield. Though not much is being used here, so it's probably
- * probably not a big deal.
- *
- * Just to add to this comment... The ndr is derived from simulated
- * memory when the cl-runtime allocates an HsaQueueEntry and populates it
- * for a kernel launch. So in theory the runtime should be able to keep
- * that state around. Then a WF can reference it upon restart to derive
- * kernel wide state. The runtime can deallocate the state when the
- * kernel completes.
- */
- NDRange *ndr;
-};
-
// State that needs to be passed between the simulation and simulated app, a
// pointer to this struct can be passed through the depends field in the
// HsaQueueEntry struct