3 files changed, 30 insertions, 130 deletions
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 5ec061172..83e2414db 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -193,50 +193,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
 }
 
 void
-ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
-                        int trueWgSize[], int trueWgSizeTotal,
-                        LdsChunk *ldsChunk, uint64_t origSpillMemStart)
-{
-    wfCtx->cnt = cnt;
-
-    VectorMask init_mask;
-    init_mask.reset();
-
-    for (int k = 0; k < wfSize(); ++k) {
-        if (k + cnt * wfSize() < trueWgSizeTotal)
-            init_mask[k] = 1;
-    }
-
-    wfCtx->init_mask = init_mask.to_ullong();
-    wfCtx->exec_mask = init_mask.to_ullong();
-
-    wfCtx->bar_cnt.resize(wfSize(), 0);
-
-    wfCtx->max_bar_cnt = 0;
-    wfCtx->old_barrier_cnt = 0;
-    wfCtx->barrier_cnt = 0;
-
-    wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
-
-    wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
-
-    wfCtx->pc = 0;
-    wfCtx->rpc = UINT32_MAX;
-
-    // set the wavefront context to have a pointer to this section of the LDS
-    wfCtx->ldsChunk = ldsChunk;
-
-    // WG state
-    wfCtx->wg_id = ndr->globalWgId;
-    wfCtx->barrier_id = barrier_id;
-
-    // Kernel wide state
-    wfCtx->ndr = ndr;
-}
-
-void
 ComputeUnit::updateEvents() {
 
     if (!timestampVec.empty()) {
@@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
 
 
 void
-ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
-                     int trueWgSizeTotal)
+ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+                     int cnt, LdsChunk *ldsChunk, NDRange *ndr)
 {
     static int _n_wave = 0;
-    int cnt = wfCtx->cnt;
-    NDRange *ndr = wfCtx->ndr;
 
     // Fill in Kernel state
     FillKernelState(w, ndr);
 
+    VectorMask init_mask;
+    init_mask.reset();
+
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
+            init_mask[k] = 1;
+    }
+
     w->kern_id = ndr->dispatchId;
     w->dynwaveid = cnt;
-    w->init_mask = wfCtx->init_mask;
+    w->init_mask = init_mask.to_ullong();
 
     for (int k = 0; k < wfSize(); ++k) {
         w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
@@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
             w->workitemid[0][k];
     }
 
-    w->old_barrier_cnt = wfCtx->old_barrier_cnt;
-    w->barrier_cnt = wfCtx->barrier_cnt;
     w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
 
-    for (int i = 0; i < wfSize(); ++i) {
-        w->bar_cnt[i] = wfCtx->bar_cnt[i];
-    }
+    w->bar_cnt.resize(wfSize(), 0);
+
+    w->max_bar_cnt = 0;
+    w->old_barrier_cnt = 0;
+    w->barrier_cnt = 0;
+
+    w->privBase = ndr->q.privMemStart;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
 
-    w->max_bar_cnt = wfCtx->max_bar_cnt;
-    w->privBase = wfCtx->privBase;
-    w->spillBase = wfCtx->spillBase;
+    w->spillBase = ndr->q.spillMemStart;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
 
-    w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+    w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ullong());
 
     // WG state
-    w->wg_id = wfCtx->wg_id;
-    w->dispatchid = wfCtx->ndr->dispatchId;
+    w->wg_id = ndr->globalWgId;
+    w->dispatchid = ndr->dispatchId;
     w->workgroupid[0] = w->wg_id % ndr->numWg[0];
     w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
     w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
 
-    w->barrier_id = wfCtx->barrier_id;
+    w->barrier_id = barrier_id;
     w->stalledAtBarrier = false;
 
-    // move this from the context into the actual wavefront
-    w->ldsChunk = wfCtx->ldsChunk;
+    // give the wavefront a pointer to this section of the LDS
+    w->ldsChunk = ldsChunk;
 
     int32_t refCount M5_VAR_USED =
                     lds.increaseRefCounter(w->dispatchid, w->wg_id);
@@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
             "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
 
     w->start(++_n_wave, ndr->q.code_ptr);
-    wfCtx->bar_cnt.clear();
 }
 
 void
@@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
         trueWgSizeTotal *= trueWgSize[d];
     }
 
-    uint64_t origSpillMemStart = ndr->q.spillMemStart;
     // calculate the number of 32-bit vector registers required by wavefront
     int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
     int cnt = 0;
@@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
             w->reservedVectorRegs = normSize;
             vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
 
-            WFContext wfCtx;
-
-            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
-                                ldsChunk, origSpillMemStart);
-
-            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+            StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
             ++cnt;
         }
     }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a234cbeb5..34b710cd6 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
     void fetch(PacketPtr pkt, Wavefront *wavefront);
     void FillKernelState(Wavefront *w, NDRange *ndr);
 
-    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
-                 int trueWgSizeTotal);
-
-    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
-                             int trueWgSize[], int trueWgSizeTotal,
-                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+    void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+                 int cnt, LdsChunk *ldsChunk, NDRange *ndr);
 
     void StartWorkgroup(NDRange *ndr);
     int ReadyWorkgroup(NDRange *ndr);
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 7bca757b8..b400dc0ee 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -95,59 +95,6 @@ struct HsaQueueEntry
     uint16_t num_args;
 };
 
-// State used to start (or restart) a WF
-struct WFContext
-{
-    // 32 bit values
-    // barrier state
-    std::vector<int> bar_cnt;
-
-    // id (which WF in the WG)
-    int cnt;
-
-    // more barrier state
-    int max_bar_cnt;
-    int old_barrier_cnt;
-    int barrier_cnt;
-
-    // More Program Counter Stuff
-    uint32_t pc;
-
-    // Program counter of the immediate post-dominator instruction
-    uint32_t rpc;
-
-    // WG wide state (I don't see how to avoid redundancy here)
-    int cu_id;
-    uint32_t wg_id;
-    uint32_t barrier_id;
-
-    // 64 bit values (these values depend on the wavefront size)
-    // masks
-    uint64_t init_mask;
-    uint64_t exec_mask;
-
-    // private memory;
-    Addr privBase;
-    Addr spillBase;
-
-    LdsChunk *ldsChunk;
-
-    /*
-     * Kernel wide state
-     * This is a hack. This state should be moved through simulated memory
-     * during a yield. Though not much is being used here, so it's probably
-     * probably not a big deal.
-     *
-     * Just to add to this comment... The ndr is derived from simulated
-     * memory when the cl-runtime allocates an HsaQueueEntry and populates it
-     * for a kernel launch. So in theory the runtime should be able to keep
-     * that state around. Then a WF can reference it upon restart to derive
-     * kernel wide state. The runtime can deallocate the state when the
-     * kernel completes.
-     */
-    NDRange *ndr;
-};
-
 // State that needs to be passed between the simulation and simulated app, a
 // pointer to this struct can be passed through the depends field in the
 // HsaQueueEntry struct