17 files changed, 151 insertions, 114 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index bd95f6335..f580a09f7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
 
     simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
     num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
     min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
 
 class Wavefront(SimObject):
@@ -68,6 +69,7 @@ class Wavefront(SimObject):
 
     simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
     wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
 
 class ComputeUnit(MemObject):
     type = 'ComputeUnit'
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
index 3b3291c03..6bb6be102 100644
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
       case HSA_GET_VSZ:
         {
             BufferArg buf(buf_addr, sizeof(uint32_t));
-            *((uint32_t*)buf.bufferPtr()) = VSZ;
+            *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
             buf.copyOut(tc->getMemProxy());
         }
         break;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b3a99b182..5ec061172 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,9 +32,10 @@
  *
  * Author: John Kalamatianos, Anthony Gutierrez
  */
-
 #include "gpu-compute/compute_unit.hh"
 
+#include <limits>
+
 #include "base/output.hh"
 #include "debug/GPUDisp.hh"
 #include "debug/GPUExec.hh"
@@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
     _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
     lds(*p->localDataStore), globalSeqNum(0),  wavefrontSize(p->wfSize)
 {
-    // this check will be eliminated once we have wavefront size support added
-    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    /**
+     * This check is necessary because std::bitset only provides conversion
+     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+     * There are a few places in the code where to_ullong() is used; however,
+     * if the wavefront size is larger than a value the host can support then
+     * bitset will throw a runtime exception. We should remove all use of
+     * to_ulong() or to_ullong() so we can have a wavefront size greater than
+     * 64b; however, until that is done this check is required.
+     */
+    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+             p->wfSize <= 0,
+             "WF size is larger than the host can support");
+    fatal_if(!isPowerOf2(wavefrontSize),
+             "Wavefront size should be a power of 2");
     // calculate how many cycles a vector load or store will need to transfer
     // its data over the corresponding buses
-    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
-                                / (double)vrfToCoalescerBusWidth);
+    numCyclesPerStoreTransfer =
+        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+                (double)vrfToCoalescerBusWidth);
 
-    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                                / coalescerToVrfBusWidth;
 
     lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
         lastVaddrWF[j].resize(p->n_wf);
 
         for (int i = 0; i < p->n_wf; ++i) {
-            lastVaddrWF[j][i].resize(VSZ);
+            lastVaddrWF[j][i].resize(wfSize());
 
             wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
             wfList[j][i]->setParent(this);
 
-            for (int k = 0; k < VSZ; ++k) {
+            for (int k = 0; k < wfSize(); ++k) {
                 lastVaddrWF[j][i][k] = 0;
             }
         }
     }
 
-    lastVaddrPhase.resize(numSIMDs);
+    lastVaddrSimd.resize(numSIMDs);
 
     for (int i = 0; i < numSIMDs; ++i) {
-        lastVaddrPhase[i] = LastVaddrWave();
+        lastVaddrSimd[i].resize(wfSize(), 0);
     }
 
-    lastVaddrCU = LastVaddrWave();
+    lastVaddrCU.resize(wfSize());
 
     lds.setParent(this);
 
@@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
         fatal("Invalid WF execution policy (CU)\n");
     }
 
-    memPort.resize(VSZ);
+    memPort.resize(wfSize());
 
     // resize the tlbPort vectorArray
-    int tlbPort_width = perLaneTLB ? VSZ : 1;
+    int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
 
     cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
 ComputeUnit::~ComputeUnit()
 {
     // Delete wavefront slots
-
-    for (int j = 0; j < numSIMDs; ++j)
+    for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
             delete wfList[j][i];
         }
-
+        lastVaddrSimd[j].clear();
+    }
+    lastVaddrCU.clear();
     readyList.clear();
     waveStatusList.clear();
     dispatchList.clear();
@@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
     VectorMask init_mask;
     init_mask.reset();
 
-    for (int k = 0; k < VSZ; ++k) {
-        if (k + cnt * VSZ < trueWgSizeTotal)
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
             init_mask[k] = 1;
     }
 
     wfCtx->init_mask = init_mask.to_ullong();
     wfCtx->exec_mask = init_mask.to_ullong();
 
-    for (int i = 0; i < VSZ; ++i) {
-        wfCtx->bar_cnt[i] = 0;
-    }
+    wfCtx->bar_cnt.resize(wfSize(), 0);
 
     wfCtx->max_bar_cnt = 0;
     wfCtx->old_barrier_cnt = 0;
     wfCtx->barrier_cnt = 0;
 
     wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
 
     wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
 
     wfCtx->pc = 0;
     wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
     w->dynwaveid = cnt;
     w->init_mask = wfCtx->init_mask;
 
-    for (int k = 0; k < VSZ; ++k) {
-        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
-        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
-        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+    for (int k = 0; k < wfSize(); ++k) {
+        w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+        w->workitemid[1][k] =
+            ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+        w->workitemid[2][k] =
+            (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
 
         w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
             trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
 
     w->old_barrier_cnt = wfCtx->old_barrier_cnt;
     w->barrier_cnt = wfCtx->barrier_cnt;
-    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+    w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
 
-    for (int i = 0; i < VSZ; ++i) {
+    for (int i = 0; i < wfSize(); ++i) {
         w->bar_cnt[i] = wfCtx->bar_cnt[i];
     }
 
@@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
     // is this the last wavefront in the workgroup
     // if set the spillWidth to be the remaining work-items
     // so that the vector access is correct
-    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
-        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
     } else {
-        w->spillWidth = VSZ;
+        w->spillWidth = wfSize();
     }
 
     DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
             "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
 
     w->start(++_n_wave, ndr->q.code_ptr);
+    wfCtx->bar_cnt.clear();
 }
 
 void
@@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
     // Send L1 cache acquire
     // isKernel + isAcquire = Kernel Begin
     if (shader->impl_kern_boundary_sync) {
-        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
                                                                 nullptr,
                                                                 nullptr, 0);
 
@@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
         if (w->status == Wavefront::S_STOPPED) {
             // if we have scheduled all work items then stop
             // scheduling wavefronts
-            if (cnt * VSZ >= trueWgSizeTotal)
+            if (cnt * wfSize() >= trueWgSizeTotal)
                 break;
 
             // reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
     // work item of the work group
     int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
     bool vregAvail = true;
-    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
     int freeWfSlots = 0;
     // check if the total number of VGPRs required by all WFs of the WG
     // fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@ ComputeUnit::init()
     // Setup space for call args
     for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
-            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+            wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
         }
     }
 
@@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
         Addr last = 0;
 
         switch(computeUnit->prefetchType) {
-          case Enums::PF_CU:
+        case Enums::PF_CU:
             last = computeUnit->lastVaddrCU[mp_index];
             break;
-          case Enums::PF_PHASE:
-            last = computeUnit->lastVaddrPhase[simdId][mp_index];
+        case Enums::PF_PHASE:
+            last = computeUnit->lastVaddrSimd[simdId][mp_index];
             break;
-          case Enums::PF_WF:
+        case Enums::PF_WF:
             last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
-          default:
+        default:
             break;
         }
 
@@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
         DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
 
         computeUnit->lastVaddrCU[mp_index] = vaddr;
-        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
         computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
 
         stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
         ;
 
     ldsBankConflictDist
-       .init(0, VSZ, 2)
+       .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;
@@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
         ;
 
     pageDivergenceDist
-       // A wavefront can touch 1 to VSZ pages per memory instruction.
-       // The number of pages per bin can be configured (here it's 4).
-       .init(1, VSZ, 4)
+        // A wavefront can touch up to N pages per memory instruction where
+        // N is equal to the wavefront size.
+        // The number of pages per bin can be configured (here it's 4).
+       .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;
 
     controlFlowDivergenceDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".warp_execution_dist")
         .desc("number of lanes active per instruction (oval all instructions)")
         ;
 
     activeLanesPerGMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".gmem_lanes_execution_dist")
         .desc("number of active lanes per global memory instruction")
         ;
 
     activeLanesPerLMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".lmem_lanes_execution_dist")
         .desc("number of active lanes per local memory instruction")
         ;
@@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
 
     numVecOpsExecuted
         .name(name() + ".num_vec_ops_executed")
-        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        .desc("number of vec ops executed (e.g. WF size/inst)")
         ;
 
     totalCycles
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index f47c27a0a..a234cbeb5 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
     // if fixed-stride prefetching, this is the stride.
     int prefetchStride;
 
-    class LastVaddrWave
-    {
-      public:
-        Addr vaddrs[VSZ];
-        Addr& operator[](int idx) {
-            return vaddrs[idx];
-        }
-
-        LastVaddrWave() {
-            for (int i = 0; i < VSZ; ++i)
-                vaddrs[i] = 0;
-        }
-    };
-
-    LastVaddrWave lastVaddrCU;
-    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<Addr> lastVaddrCU;
+    std::vector<std::vector<Addr>> lastVaddrSimd;
     std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
     Enums::PrefetchType prefetchType;
     EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 95c0c56a2..d1d011c0d 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
     return shader->cuList.size();
 }
 
+int
+GpuDispatcher::wfSize() const
+{
+    return shader->cuList[0]->wfSize();
+}
+
 void
 GpuDispatcher::setFuncargsSize(int funcargs_size)
 {
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
index 76f932655..e984af494 100644
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
 
         // helper functions to retrieve/set GPU attributes
         int getNumCUs();
+        int wfSize() const;
         void setFuncargsSize(int funcargs_size);
 };
 
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index 355018666..a6a4d86db 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
                 int physVgpr = w->remap(dst, sizeof(c0), 1);
                 // save the physical VGPR index
                 regVec.push_back(physVgpr);
-                c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                     if (m->exec_mask[i]) {
                         DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                 "$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 2f35a983c..1806e79e4 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
 
 GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                        GPUStaticInst *_staticInst, uint64_t instSeqNum)
-    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+    : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+      m_op(Enums::MO_UNDEF),
       memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
       statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
 {
-    tlbHitLevel.assign(VSZ, -1);
+    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+    d_data = new uint8_t[computeUnit()->wfSize() * 16];
+    a_data = new uint8_t[computeUnit()->wfSize() * 8];
+    x_data = new uint8_t[computeUnit()->wfSize() * 8];
+    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+        a_data[i] = 0;
+        x_data[i] = 0;
+    }
+    for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+        d_data[i] = 0;
+    }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+    delete[] d_data;
+    delete[] a_data;
+    delete[] x_data;
 }
 
 void
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index e44d8f80d..46774d867 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
   public:
     GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
                uint64_t instSeqNum);
-
+    ~GPUDynInst();
     void execute();
     int numSrcRegOperands();
     int numDstRegOperands();
@@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
     Enums::StorageClassType executedAs();
 
     // The address of the memory operation
-    Addr addr[VSZ];
+    std::vector<Addr> addr;
     Addr pAddr;
 
     // The data to get written
-    uint8_t d_data[VSZ * 16];
+    uint8_t *d_data;
     // Additional data (for atomics)
-    uint8_t a_data[VSZ * 8];
+    uint8_t *a_data;
     // Additional data (for atomics)
-    uint8_t x_data[VSZ * 8];
+    uint8_t *x_data;
     // The execution mask
     VectorMask exec_mask;
 
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 7f919c5f4..a970d8f9b 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
             int physVgpr = w->remap(dst,sizeof(c0),1);
             // save the physical VGPR index
             regVec.push_back(physVgpr);
-            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                 if (m->exec_mask[i]) {
                     // write the value into the physical VGPR. This is a purely
                     // functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
index 4f8032832..5ade89789 100644
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -37,28 +37,14 @@
 #define __MISC_HH__
 
 #include <bitset>
+#include <limits>
 #include <memory>
 
 #include "base/misc.hh"
 
 class GPUDynInst;
 
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
-              "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
 typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
 
 class WaitClass
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
index 092303c00..7bca757b8 100644
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@ struct WFContext
 {
     // 32 bit values
     // barrier state
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
 
     // id (which WF in the WG)
     int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index 8b7dc0691..c43d765af 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
     nxtBusy.clear();
     nxtBusy.resize(numRegsPerSimd, 0);
 
-    vgprState->init(numRegsPerSimd);
+    vgprState->init(numRegsPerSimd, p->wfSize);
 }
 
 void
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
index f231b0579..e177d3b64 100644
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
 
 #include "gpu-compute/vector_register_state.hh"
 
+#include <limits>
+
 #include "gpu-compute/compute_unit.hh"
 
 VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
 }
 
 void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
 {
     s_reg.resize(_size);
+    fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+             wf_size <= 0,
+             "WF size is larger than the host can support or is zero");
+    fatal_if((wf_size & (wf_size - 1)) != 0,
+             "Wavefront size should be a power of 2");
+    for (int i = 0; i < s_reg.size(); ++i) {
+        s_reg[i].resize(wf_size, 0);
+    }
     d_reg.resize(_size);
+    for (int i = 0; i < d_reg.size(); ++i) {
+        d_reg[i].resize(wf_size, 0);
+    }
 }
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
index a233b9acc..97a0d8e25 100644
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@ class VecRegisterState
 {
   public:
     VecRegisterState();
-    void init(uint32_t _size);
+    void init(uint32_t _size, uint32_t wf_size);
 
     const std::string& name() const { return _name; }
     void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@ class VecRegisterState
     ComputeUnit *computeUnit;
     std::string _name;
     // 32-bit Single Precision Vector Register State
-    std::vector<std::array<uint32_t, VSZ>> s_reg;
+    std::vector<std::vector<uint32_t>> s_reg;
     // 64-bit Double Precision Vector Register State
-    std::vector<std::array<uint64_t, VSZ>> d_reg;
+    std::vector<std::vector<uint64_t>> d_reg;
 };
 
 #endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 7cdec53e5..a20330082 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
     last_trace = 0;
     simdId = p->simdId;
     wfSlotId = p->wf_slot_id;
-
     status = S_STOPPED;
     reservedVectorRegs = 0;
     startVgprIndex = 0;
@@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
     mem_trace_busy = 0;
     old_vgpr_tcnt = 0xffffffffffffffffll;
     old_dgpr_tcnt = 0xffffffffffffffffll;
+    old_vgpr.resize(p->wfSize);
 
     pendingFetch = false;
     dropFetch = false;
     condRegState = new ConditionRegisterState();
     maxSpVgprs = 0;
     maxDpVgprs = 0;
+    last_addr.resize(p->wfSize);
+    workitemFlatId.resize(p->wfSize);
+    old_dgpr.resize(p->wfSize);
+    bar_cnt.resize(p->wfSize);
+    for (int i = 0; i < 3; ++i) {
+        workitemid[i].resize(p->wfSize);
+    }
 }
 
 void
@@ -144,6 +151,7 @@ Wavefront::~Wavefront()
 {
     if (callArgMem)
         delete callArgMem;
+    delete condRegState;
 }
 
 void
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 0abab8e83..5a5386a3d 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@ class CallArgMem
   public:
     // pointer to buffer for storing function arguments
     uint8_t *mem;
+    int wfSize;
     // size of function args
     int funcArgsSizePerItem;
 
@@ -90,13 +91,13 @@ class CallArgMem
     int
     getLaneOffset(int lane, int addr)
     {
-        return addr * VSZ + sizeof(CType) * lane;
+        return addr * wfSize + sizeof(CType) * lane;
     }
 
-    CallArgMem(int func_args_size_per_item)
-      : funcArgsSizePerItem(func_args_size_per_item)
+    CallArgMem(int func_args_size_per_item, int wf_size)
+        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
     {
-        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
     }
 
     ~CallArgMem()
@@ -192,9 +193,9 @@ class Wavefront : public SimObject
     bool isOldestInstALU();
     bool isOldestInstBarrier();
     // used for passing spill address to DDInstGPU
-    uint64_t last_addr[VSZ];
-    uint32_t workitemid[3][VSZ];
-    uint32_t workitemFlatId[VSZ];
+    std::vector<Addr> last_addr;
+    std::vector<uint32_t> workitemid[3];
+    std::vector<uint32_t> workitemFlatId;
     uint32_t workgroupid[3];
     uint32_t workgroupsz[3];
     uint32_t gridsz[3];
@@ -230,14 +231,14 @@ class Wavefront : public SimObject
     uint32_t startVgprIndex;
 
     // Old value of destination gpr (for trace)
-    uint32_t old_vgpr[VSZ];
+    std::vector<uint32_t> old_vgpr;
     // Id of destination gpr (for trace)
     uint32_t old_vgpr_id;
     // Tick count of last old_vgpr copy
     uint64_t old_vgpr_tcnt;
 
     // Old value of destination gpr (for trace)
-    uint64_t old_dgpr[VSZ];
+    std::vector<uint64_t> old_dgpr;
     // Id of destination gpr (for trace)
     uint32_t old_dgpr_id;
     // Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@ class Wavefront : public SimObject
     VectorMask init_mask;
 
     // number of barriers this WF has joined
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
     int max_bar_cnt;
     // Flag to stall a wave on barrier
     bool stalledAtBarrier;
@@ -296,9 +297,9 @@ class Wavefront : public SimObject
     // argument memory for hsail call instruction
     CallArgMem *callArgMem;
     void
-    initCallArgMem(int func_args_size_per_item)
+    initCallArgMem(int func_args_size_per_item, int wf_size)
     {
-        callArgMem = new CallArgMem(func_args_size_per_item);
+        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
     }
 
     template<typename CType>
@@ -327,7 +328,6 @@ class Wavefront : public SimObject
     }
 
     void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
     void exec();
     void updateResources();
     int ready(itype_e type);