gpu-compute: parametrize Wavefront size

Eliminate the VSZ constant that defined the Wavefront size (in number of work items) and replace it with a parameter in the GPU.py configuration script. Change all data structures that depend on the Wavefront size to be dynamically sized. Legal values of the Wavefront size are 16, 32, and 64 for now; the size is checked at initialization time.
author: jkalamat <john.kalamatianos@amd.com> 2016-06-09 11:24:55 -0400
committer: jkalamat <john.kalamatianos@amd.com> 2016-06-09 11:24:55 -0400
commit: 3724fb15faafaaca54cc7a500df9c1490a387049 (patch)
tree: bbd671b68ba971087a1cd45b208947c09a622d38 /src/gpu-compute
parent: e5b7b6780f9748b6f13ef91e3e22d53ebdf47968 (diff)
download: gem5-3724fb15faafaaca54cc7a500df9c1490a387049.tar.xz
17 files changed, 151 insertions, 114 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index bd95f6335..f580a09f7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
 
     simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
     num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
     min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
 
 class Wavefront(SimObject):
@@ -68,6 +69,7 @@ class Wavefront(SimObject):
 
     simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
     wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
 
 class ComputeUnit(MemObject):
     type = 'ComputeUnit'
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
index 3b3291c03..6bb6be102 100644
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
       case HSA_GET_VSZ:
         {
             BufferArg buf(buf_addr, sizeof(uint32_t));
-            *((uint32_t*)buf.bufferPtr()) = VSZ;
+            *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
             buf.copyOut(tc->getMemProxy());
         }
         break;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b3a99b182..5ec061172 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
  *
  * Author: John Kalamatianos, Anthony Gutierrez
  */
-
 #include "gpu-compute/compute_unit.hh"
 
+#include <limits>
+
 #include "base/output.hh"
 #include "debug/GPUDisp.hh"
 #include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
     _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
     lds(*p->localDataStore), globalSeqNum(0),  wavefrontSize(p->wfSize)
 {
-    // this check will be eliminated once we have wavefront size support added
-    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    /**
+     * This check is necessary because std::bitset only provides conversion
+     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+     * There are a few places in the code where to_ullong() is used; however,
+     * if the wavefront size is larger than the host can support, bitset will
+     * throw a runtime exception. We should remove all uses of to_ulong() and
+     * to_ullong() so the wavefront size can be greater than 64, but until
+     * that is done this check is required.
+     */
+    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+             p->wfSize <= 0,
+             "WF size is larger than the host can support");
+    fatal_if(!isPowerOf2(wavefrontSize),
+             "Wavefront size should be a power of 2");
     // calculate how many cycles a vector load or store will need to transfer
     // its data over the corresponding buses
-    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
-                                / (double)vrfToCoalescerBusWidth);
+    numCyclesPerStoreTransfer =
+        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+                (double)vrfToCoalescerBusWidth);
 
-    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                                / coalescerToVrfBusWidth;
 
     lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
         lastVaddrWF[j].resize(p->n_wf);
 
         for (int i = 0; i < p->n_wf; ++i) {
-            lastVaddrWF[j][i].resize(VSZ);
+            lastVaddrWF[j][i].resize(wfSize());
 
             wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
             wfList[j][i]->setParent(this);
 
-            for (int k = 0; k < VSZ; ++k) {
+            for (int k = 0; k < wfSize(); ++k) {
                 lastVaddrWF[j][i][k] = 0;
             }
         }
     }
 
-    lastVaddrPhase.resize(numSIMDs);
+    lastVaddrSimd.resize(numSIMDs);
 
     for (int i = 0; i < numSIMDs; ++i) {
-        lastVaddrPhase[i] = LastVaddrWave();
+        lastVaddrSimd[i].resize(wfSize(), 0);
     }
 
-    lastVaddrCU = LastVaddrWave();
+    lastVaddrCU.resize(wfSize());
 
     lds.setParent(this);
 
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
         fatal("Invalid WF execution policy (CU)\n");
     }
 
-    memPort.resize(VSZ);
+    memPort.resize(wfSize());
 
     // resize the tlbPort vectorArray
-    int tlbPort_width = perLaneTLB ? VSZ : 1;
+    int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
 
     cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
 ComputeUnit::~ComputeUnit()
 {
     // Delete wavefront slots
-
-    for (int j = 0; j < numSIMDs; ++j)
+    for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
             delete wfList[j][i];
         }
-
+        lastVaddrSimd[j].clear();
+    }
+    lastVaddrCU.clear();
     readyList.clear();
     waveStatusList.clear();
     dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
     VectorMask init_mask;
     init_mask.reset();
 
-    for (int k = 0; k < VSZ; ++k) {
-        if (k + cnt * VSZ < trueWgSizeTotal)
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
             init_mask[k] = 1;
     }
 
     wfCtx->init_mask = init_mask.to_ullong();
     wfCtx->exec_mask = init_mask.to_ullong();
 
-    for (int i = 0; i < VSZ; ++i) {
-        wfCtx->bar_cnt[i] = 0;
-    }
+    wfCtx->bar_cnt.resize(wfSize(), 0);
 
     wfCtx->max_bar_cnt = 0;
     wfCtx->old_barrier_cnt = 0;
     wfCtx->barrier_cnt = 0;
 
     wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
 
     wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
 
     wfCtx->pc = 0;
     wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
     w->dynwaveid = cnt;
     w->init_mask = wfCtx->init_mask;
 
-    for (int k = 0; k < VSZ; ++k) {
-        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
-        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
-        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+    for (int k = 0; k < wfSize(); ++k) {
+        w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+        w->workitemid[1][k] =
+            ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+        w->workitemid[2][k] =
+            (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
 
         w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
             trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
 
     w->old_barrier_cnt = wfCtx->old_barrier_cnt;
     w->barrier_cnt = wfCtx->barrier_cnt;
-    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+    w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
 
-    for (int i = 0; i < VSZ; ++i) {
+    for (int i = 0; i < wfSize(); ++i) {
         w->bar_cnt[i] = wfCtx->bar_cnt[i];
     }
 
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
     // is this the last wavefront in the workgroup
     // if set the spillWidth to be the remaining work-items
     // so that the vector access is correct
-    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
-        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
     } else {
-        w->spillWidth = VSZ;
+        w->spillWidth = wfSize();
     }
 
     DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
             "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
 
     w->start(++_n_wave, ndr->q.code_ptr);
+    wfCtx->bar_cnt.clear();
 }
 
 void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
     // Send L1 cache acquire
     // isKernel + isAcquire = Kernel Begin
     if (shader->impl_kern_boundary_sync) {
-        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
                                                                 nullptr,
                                                                 nullptr, 0);
 
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
         if (w->status == Wavefront::S_STOPPED) {
             // if we have scheduled all work items then stop
             // scheduling wavefronts
-            if (cnt * VSZ >= trueWgSizeTotal)
+            if (cnt * wfSize() >= trueWgSizeTotal)
                 break;
 
             // reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
     // work item of the work group
     int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
     bool vregAvail = true;
-    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
     int freeWfSlots = 0;
     // check if the total number of VGPRs required by all WFs of the WG
     // fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
     // Setup space for call args
     for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
-            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+            wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
         }
     }
 
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
         Addr last = 0;
 
         switch(computeUnit->prefetchType) {
-          case Enums::PF_CU:
+        case Enums::PF_CU:
             last = computeUnit->lastVaddrCU[mp_index];
             break;
-          case Enums::PF_PHASE:
-            last = computeUnit->lastVaddrPhase[simdId][mp_index];
+        case Enums::PF_PHASE:
+            last = computeUnit->lastVaddrSimd[simdId][mp_index];
             break;
-          case Enums::PF_WF:
+        case Enums::PF_WF:
             last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
-          default:
+        default:
             break;
         }
 
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
         DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
 
         computeUnit->lastVaddrCU[mp_index] = vaddr;
-        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
         computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
 
         stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
         ;
 
     ldsBankConflictDist
-       .init(0, VSZ, 2)
+       .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
         ;
 
     pageDivergenceDist
-       // A wavefront can touch 1 to VSZ pages per memory instruction.
-       // The number of pages per bin can be configured (here it's 4).
-       .init(1, VSZ, 4)
+        // A wavefront can touch up to N pages per memory instruction, where
+        // N is equal to the wavefront size.
+        // The number of pages per bin can be configured (here it's 4).
+       .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;
 
     controlFlowDivergenceDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".warp_execution_dist")
         .desc("number of lanes active per instruction (oval all instructions)")
         ;
 
     activeLanesPerGMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".gmem_lanes_execution_dist")
         .desc("number of active lanes per global memory instruction")
         ;
 
     activeLanesPerLMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".lmem_lanes_execution_dist")
         .desc("number of active lanes per local memory instruction")
         ;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
 
     numVecOpsExecuted
         .name(name() + ".num_vec_ops_executed")
-        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        .desc("number of vec ops executed (e.g. WF size/inst)")
         ;
 
     totalCycles
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index f47c27a0a..a234cbeb5 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
     // if fixed-stride prefetching, this is the stride.
     int prefetchStride;
 
-    class LastVaddrWave
-    {
-      public:
-        Addr vaddrs[VSZ];
-        Addr& operator[](int idx) {
-            return vaddrs[idx];
-        }
-
-        LastVaddrWave() {
-            for (int i = 0; i < VSZ; ++i)
-                vaddrs[i] = 0;
-        }
-    };
-
-    LastVaddrWave lastVaddrCU;
-    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<Addr> lastVaddrCU;
+    std::vector<std::vector<Addr>> lastVaddrSimd;
     std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
     Enums::PrefetchType prefetchType;
     EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 95c0c56a2..d1d011c0d 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
     return shader->cuList.size();
 }
 
+int
+GpuDispatcher::wfSize() const
+{
+    return shader->cuList[0]->wfSize();
+}
+
 void
 GpuDispatcher::setFuncargsSize(int funcargs_size)
 {
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
index 76f932655..e984af494 100644
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
 
         // helper functions to retrieve/set GPU attributes
         int getNumCUs();
+        int wfSize() const;
         void setFuncargsSize(int funcargs_size);
 };
 
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index 355018666..a6a4d86db 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
                 int physVgpr = w->remap(dst, sizeof(c0), 1);
                 // save the physical VGPR index
                 regVec.push_back(physVgpr);
-                c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                     if (m->exec_mask[i]) {
                         DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                 "$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 2f35a983c..1806e79e4 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
 
 GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                        GPUStaticInst *_staticInst, uint64_t instSeqNum)
-    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+    : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+      m_op(Enums::MO_UNDEF),
       memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
       statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
 {
-    tlbHitLevel.assign(VSZ, -1);
+    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+    d_data = new uint8_t[computeUnit()->wfSize() * 16];
+    a_data = new uint8_t[computeUnit()->wfSize() * 8];
+    x_data = new uint8_t[computeUnit()->wfSize() * 8];
+    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+        a_data[i] = 0;
+        x_data[i] = 0;
+    }
+    for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+        d_data[i] = 0;
+    }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+    delete[] d_data;
+    delete[] a_data;
+    delete[] x_data;
 }
 
 void
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index e44d8f80d..46774d867 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
   public:
     GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
                uint64_t instSeqNum);
-
+    ~GPUDynInst();
     void execute();
     int numSrcRegOperands();
     int numDstRegOperands();
@@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
     Enums::StorageClassType executedAs();
 
     // The address of the memory operation
-    Addr addr[VSZ];
+    std::vector<Addr> addr;
     Addr pAddr;
 
     // The data to get written
-    uint8_t d_data[VSZ * 16];
+    uint8_t *d_data;
     // Additional data (for atomics)
-    uint8_t a_data[VSZ * 8];
+    uint8_t *a_data;
     // Additional data (for atomics)
-    uint8_t x_data[VSZ * 8];
+    uint8_t *x_data;
     // The execution mask
     VectorMask exec_mask;
 
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 7f919c5f4..a970d8f9b 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
             int physVgpr = w->remap(dst,sizeof(c0),1);
             // save the physical VGPR index
             regVec.push_back(physVgpr);
-            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                 if (m->exec_mask[i]) {
                     // write the value into the physical VGPR. This is a purely
                     // functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
index 4f8032832..5ade89789 100644
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -37,28 +37,14 @@
 #define __MISC_HH__
 
 #include <bitset>
+#include <limits>
 #include <memory>
 
 #include "base/misc.hh"
 
 class GPUDynInst;
 
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
-              "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
 typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
 
 class WaitClass
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 092303c00..7bca757b8 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@ struct WFContext
 {
     // 32 bit values
     // barrier state
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
 
     // id (which WF in the WG)
     int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index 8b7dc0691..c43d765af 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
     nxtBusy.clear();
     nxtBusy.resize(numRegsPerSimd, 0);
 
-    vgprState->init(numRegsPerSimd);
+    vgprState->init(numRegsPerSimd, p->wfSize);
 }
 
 void
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
index f231b0579..e177d3b64 100644
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
 
 #include "gpu-compute/vector_register_state.hh"
 
+#include <limits>
+
 #include "gpu-compute/compute_unit.hh"
 
 VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
 }
 
 void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
 {
     s_reg.resize(_size);
+    fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+             wf_size <= 0,
+             "WF size is larger than the host can support or is zero");
+    fatal_if((wf_size & (wf_size - 1)) != 0,
+             "Wavefront size should be a power of 2");
+    for (int i = 0; i < s_reg.size(); ++i) {
+        s_reg[i].resize(wf_size, 0);
+    }
     d_reg.resize(_size);
+    for (int i = 0; i < d_reg.size(); ++i) {
+        d_reg[i].resize(wf_size, 0);
+    }
 }
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
index a233b9acc..97a0d8e25 100644
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@ class VecRegisterState
 {
   public:
     VecRegisterState();
-    void init(uint32_t _size);
+    void init(uint32_t _size, uint32_t wf_size);
 
     const std::string& name() const { return _name; }
     void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@ class VecRegisterState
     ComputeUnit *computeUnit;
     std::string _name;
     // 32-bit Single Precision Vector Register State
-    std::vector<std::array<uint32_t, VSZ>> s_reg;
+    std::vector<std::vector<uint32_t>> s_reg;
     // 64-bit Double Precision Vector Register State
-    std::vector<std::array<uint64_t, VSZ>> d_reg;
+    std::vector<std::vector<uint64_t>> d_reg;
 };
 
 #endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 7cdec53e5..a20330082 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
     last_trace = 0;
     simdId = p->simdId;
     wfSlotId = p->wf_slot_id;
-
     status = S_STOPPED;
     reservedVectorRegs = 0;
     startVgprIndex = 0;
@@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
     mem_trace_busy = 0;
     old_vgpr_tcnt = 0xffffffffffffffffll;
     old_dgpr_tcnt = 0xffffffffffffffffll;
+    old_vgpr.resize(p->wfSize);
 
     pendingFetch = false;
     dropFetch = false;
     condRegState = new ConditionRegisterState();
     maxSpVgprs = 0;
     maxDpVgprs = 0;
+    last_addr.resize(p->wfSize);
+    workitemFlatId.resize(p->wfSize);
+    old_dgpr.resize(p->wfSize);
+    bar_cnt.resize(p->wfSize);
+    for (int i = 0; i < 3; ++i) {
+        workitemid[i].resize(p->wfSize);
+    }
 }
 
 void
@@ -144,6 +151,7 @@ Wavefront::~Wavefront()
 {
     if (callArgMem)
         delete callArgMem;
+    delete condRegState;
 }
 
 void
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 0abab8e83..5a5386a3d 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@ class CallArgMem
   public:
     // pointer to buffer for storing function arguments
     uint8_t *mem;
+    int wfSize;
     // size of function args
     int funcArgsSizePerItem;
 
@@ -90,13 +91,13 @@ class CallArgMem
     int
     getLaneOffset(int lane, int addr)
     {
-        return addr * VSZ + sizeof(CType) * lane;
+        return addr * wfSize + sizeof(CType) * lane;
     }
 
-    CallArgMem(int func_args_size_per_item)
-      : funcArgsSizePerItem(func_args_size_per_item)
+    CallArgMem(int func_args_size_per_item, int wf_size)
+        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
     {
-        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
     }
 
     ~CallArgMem()
@@ -192,9 +193,9 @@ class Wavefront : public SimObject
     bool isOldestInstALU();
     bool isOldestInstBarrier();
     // used for passing spill address to DDInstGPU
-    uint64_t last_addr[VSZ];
-    uint32_t workitemid[3][VSZ];
-    uint32_t workitemFlatId[VSZ];
+    std::vector<Addr> last_addr;
+    std::vector<uint32_t> workitemid[3];
+    std::vector<uint32_t> workitemFlatId;
     uint32_t workgroupid[3];
     uint32_t workgroupsz[3];
     uint32_t gridsz[3];
@@ -230,14 +231,14 @@ class Wavefront : public SimObject
     uint32_t startVgprIndex;
 
     // Old value of destination gpr (for trace)
-    uint32_t old_vgpr[VSZ];
+    std::vector<uint32_t> old_vgpr;
     // Id of destination gpr (for trace)
     uint32_t old_vgpr_id;
     // Tick count of last old_vgpr copy
     uint64_t old_vgpr_tcnt;
 
     // Old value of destination gpr (for trace)
-    uint64_t old_dgpr[VSZ];
+    std::vector<uint64_t> old_dgpr;
     // Id of destination gpr (for trace)
     uint32_t old_dgpr_id;
     // Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@ class Wavefront : public SimObject
     VectorMask init_mask;
 
     // number of barriers this WF has joined
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
     int max_bar_cnt;
     // Flag to stall a wave on barrier
     bool stalledAtBarrier;
@@ -296,9 +297,9 @@ class Wavefront : public SimObject
     // argument memory for hsail call instruction
     CallArgMem *callArgMem;
     void
-    initCallArgMem(int func_args_size_per_item)
+    initCallArgMem(int func_args_size_per_item, int wf_size)
     {
-        callArgMem = new CallArgMem(func_args_size_per_item);
+        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
     }
 
     template<typename CType>
@@ -327,7 +328,6 @@ class Wavefront : public SimObject
     }
 
     void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
     void exec();
     void updateResources();
     int ready(itype_e type);
author	jkalamat <john.kalamatianos@amd.com>	2016-06-09 11:24:55 -0400
committer	jkalamat <john.kalamatianos@amd.com>	2016-06-09 11:24:55 -0400
commit	3724fb15faafaaca54cc7a500df9c1490a387049 (patch)
tree	bbd671b68ba971087a1cd45b208947c09a622d38 /src/gpu-compute
parent	e5b7b6780f9748b6f13ef91e3e22d53ebdf47968 (diff)
download	gem5-3724fb15faafaaca54cc7a500df9c1490a387049.tar.xz