diff options
Diffstat (limited to 'src/gpu-compute')
-rw-r--r-- | src/gpu-compute/GPU.py | 2 | ||||
-rw-r--r-- | src/gpu-compute/cl_driver.cc | 2 | ||||
-rw-r--r-- | src/gpu-compute/compute_unit.cc | 117 | ||||
-rw-r--r-- | src/gpu-compute/compute_unit.hh | 18 | ||||
-rw-r--r-- | src/gpu-compute/dispatcher.cc | 6 | ||||
-rw-r--r-- | src/gpu-compute/dispatcher.hh | 1 | ||||
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.cc | 4 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.cc | 22 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.hh | 10 | ||||
-rw-r--r-- | src/gpu-compute/local_memory_pipeline.cc | 4 | ||||
-rw-r--r-- | src/gpu-compute/misc.hh | 18 | ||||
-rw-r--r-- | src/gpu-compute/qstruct.hh | 2 | ||||
-rw-r--r-- | src/gpu-compute/vector_register_file.cc | 2 | ||||
-rw-r--r-- | src/gpu-compute/vector_register_state.cc | 15 | ||||
-rw-r--r-- | src/gpu-compute/vector_register_state.hh | 6 | ||||
-rw-r--r-- | src/gpu-compute/wavefront.cc | 10 | ||||
-rw-r--r-- | src/gpu-compute/wavefront.hh | 26 |
17 files changed, 151 insertions, 114 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index bd95f6335..f580a09f7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject): simd_id = Param.Int(0, 'SIMD ID associated with this VRF') num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + wfSize = Param.Int(64, 'Wavefront size (in work items)') min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') class Wavefront(SimObject): @@ -68,6 +69,7 @@ class Wavefront(SimObject): simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + wfSize = Param.Int(64, 'Wavefront size (in work items)') class ComputeUnit(MemObject): type = 'ComputeUnit' diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc index 3b3291c03..6bb6be102 100644 --- a/src/gpu-compute/cl_driver.cc +++ b/src/gpu-compute/cl_driver.cc @@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) case HSA_GET_VSZ: { BufferArg buf(buf_addr, sizeof(uint32_t)); - *((uint32_t*)buf.bufferPtr()) = VSZ; + *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize(); buf.copyOut(tc->getMemProxy()); } break; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index b3a99b182..5ec061172 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -32,9 +32,10 @@ * * Author: John Kalamatianos, Anthony Gutierrez */ - #include "gpu-compute/compute_unit.hh" +#include <limits> + #include "base/output.hh" #include "debug/GPUDisp.hh" #include "debug/GPUExec.hh" @@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) { - // this check will be eliminated once we have wavefront size support added - fatal_if(p->wfSize != VSZ, "Wavefront size parameter does 
not match VSZ"); + /** + * This check is necessary because std::bitset only provides conversion + * to unsigned long or unsigned long long via to_ulong() or to_ullong(). + * There are a few places in the code where to_ullong() is used, however + * if the wavefront size is larger than a value the host can support then + * bitset will throw a runtime exception. We should remove all use of + * to_ulong() or to_ullong() so we can have a wavefront size greater than + * 64b, however until that is done this check is required. + */ + fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits || + p->wfSize <= 0, + "WF size is larger than the host can support"); + fatal_if(!isPowerOf2(wavefrontSize), + "Wavefront size should be a power of 2"); // calculate how many cycles a vector load or store will need to transfer // its data over the corresponding buses - numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) - / (double)vrfToCoalescerBusWidth); + numCyclesPerStoreTransfer = + (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) / + (double)vrfToCoalescerBusWidth); - numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; lastVaddrWF.resize(numSIMDs); @@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { - lastVaddrWF[j][i].resize(VSZ); + lastVaddrWF[j][i].resize(wfSize()); wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); wfList[j][i]->setParent(this); - for (int k = 0; k < VSZ; ++k) { + for (int k = 0; k < wfSize(); ++k) { lastVaddrWF[j][i][k] = 0; } } } - lastVaddrPhase.resize(numSIMDs); + lastVaddrSimd.resize(numSIMDs); for (int i = 0; i < numSIMDs; ++i) { - lastVaddrPhase[i] = LastVaddrWave(); + lastVaddrSimd[i].resize(wfSize(), 0); } - lastVaddrCU = LastVaddrWave(); + lastVaddrCU.resize(wfSize()); lds.setParent(this); @@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const 
Params *p) : MemObject(p), fetchStage(p), fatal("Invalid WF execution policy (CU)\n"); } - memPort.resize(VSZ); + memPort.resize(wfSize()); // resize the tlbPort vectorArray - int tlbPort_width = perLaneTLB ? VSZ : 1; + int tlbPort_width = perLaneTLB ? wfSize() : 1; tlbPort.resize(tlbPort_width); cuExitCallback = new CUExitCallback(this); @@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), ComputeUnit::~ComputeUnit() { // Delete wavefront slots - - for (int j = 0; j < numSIMDs; ++j) + for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } - + lastVaddrSimd[j].clear(); + } + lastVaddrCU.clear(); readyList.clear(); waveStatusList.clear(); dispatchList.clear(); @@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, VectorMask init_mask; init_mask.reset(); - for (int k = 0; k < VSZ; ++k) { - if (k + cnt * VSZ < trueWgSizeTotal) + for (int k = 0; k < wfSize(); ++k) { + if (k + cnt * wfSize() < trueWgSizeTotal) init_mask[k] = 1; } wfCtx->init_mask = init_mask.to_ullong(); wfCtx->exec_mask = init_mask.to_ullong(); - for (int i = 0; i < VSZ; ++i) { - wfCtx->bar_cnt[i] = 0; - } + wfCtx->bar_cnt.resize(wfSize(), 0); wfCtx->max_bar_cnt = 0; wfCtx->old_barrier_cnt = 0; wfCtx->barrier_cnt = 0; wfCtx->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); wfCtx->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); wfCtx->pc = 0; wfCtx->rpc = UINT32_MAX; @@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], w->dynwaveid = cnt; w->init_mask = wfCtx->init_mask; - for (int k = 0; k < VSZ; ++k) { - w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; - w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; - 
w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + for (int k = 0; k < wfSize(); ++k) { + w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; + w->workitemid[1][k] = + ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = + (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]); w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + @@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], w->old_barrier_cnt = wfCtx->old_barrier_cnt; w->barrier_cnt = wfCtx->barrier_cnt; - w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < wfSize(); ++i) { w->bar_cnt[i] = wfCtx->bar_cnt[i]; } @@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], // is this the last wavefront in the workgroup // if set the spillWidth to be the remaining work-items // so that the vector access is correct - if ((cnt + 1) * VSZ >= trueWgSizeTotal) { - w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + if ((cnt + 1) * wfSize() >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * wfSize()); } else { - w->spillWidth = VSZ; + w->spillWidth = wfSize(); } DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); w->start(++_n_wave, ndr->q.code_ptr); + wfCtx->bar_cnt.clear(); } void @@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr, + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this, nullptr, nullptr, 0); @@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) if (w->status == Wavefront::S_STOPPED) { // if we have scheduled all work items 
then stop // scheduling wavefronts - if (cnt * VSZ >= trueWgSizeTotal) + if (cnt * wfSize() >= trueWgSizeTotal) break; // reserve vector registers for the scheduled wavefront @@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr) // work item of the work group int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); bool vregAvail = true; - int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); int freeWfSlots = 0; // check if the total number of VGPRs required by all WFs of the WG // fit in the VRFs of all SIMD units @@ -623,7 +639,7 @@ ComputeUnit::init() // Setup space for call args for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { - wfList[j][i]->initCallArgMem(shader->funcargs_size); + wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize); } } @@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) Addr last = 0; switch(computeUnit->prefetchType) { - case Enums::PF_CU: + case Enums::PF_CU: last = computeUnit->lastVaddrCU[mp_index]; break; - case Enums::PF_PHASE: - last = computeUnit->lastVaddrPhase[simdId][mp_index]; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrSimd[simdId][mp_index]; break; - case Enums::PF_WF: + case Enums::PF_WF: last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; - default: + default: break; } @@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) DPRINTF(GPUPrefetch, "Stride is %d\n", stride); computeUnit->lastVaddrCU[mp_index] = vaddr; - computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr; computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? 
@@ -1488,7 +1504,7 @@ ComputeUnit::regStats() ; ldsBankConflictDist - .init(0, VSZ, 2) + .init(0, wfSize(), 2) .name(name() + ".lds_bank_conflicts") .desc("Number of bank conflicts per LDS memory packet") ; @@ -1499,27 +1515,28 @@ ComputeUnit::regStats() ; pageDivergenceDist - // A wavefront can touch 1 to VSZ pages per memory instruction. - // The number of pages per bin can be configured (here it's 4). - .init(1, VSZ, 4) + // A wavefront can touch up to N pages per memory instruction where + // N is equal to the wavefront size + // The number of pages per bin can be configured (here it's 4). + .init(1, wfSize(), 4) .name(name() + ".page_divergence_dist") .desc("pages touched per wf (over all mem. instr.)") ; controlFlowDivergenceDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".warp_execution_dist") .desc("number of lanes active per instruction (oval all instructions)") ; activeLanesPerGMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".gmem_lanes_execution_dist") .desc("number of active lanes per global memory instruction") ; activeLanesPerLMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".lmem_lanes_execution_dist") .desc("number of active lanes per local memory instruction") ; @@ -1531,7 +1548,7 @@ ComputeUnit::regStats() numVecOpsExecuted .name(name() + ".num_vec_ops_executed") - .desc("number of vec ops executed (e.g. VSZ/inst)") + .desc("number of vec ops executed (e.g. WF size/inst)") ; totalCycles diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index f47c27a0a..a234cbeb5 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -161,22 +161,8 @@ class ComputeUnit : public MemObject // if fixed-stride prefetching, this is the stride. 
int prefetchStride; - class LastVaddrWave - { - public: - Addr vaddrs[VSZ]; - Addr& operator[](int idx) { - return vaddrs[idx]; - } - - LastVaddrWave() { - for (int i = 0; i < VSZ; ++i) - vaddrs[i] = 0; - } - }; - - LastVaddrWave lastVaddrCU; - std::vector<LastVaddrWave> lastVaddrPhase; + std::vector<Addr> lastVaddrCU; + std::vector<std::vector<Addr>> lastVaddrSimd; std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; Enums::PrefetchType prefetchType; EXEC_POLICY exec_policy; diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 95c0c56a2..d1d011c0d 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs() return shader->cuList.size(); } +int +GpuDispatcher::wfSize() const +{ + return shader->cuList[0]->wfSize(); +} + void GpuDispatcher::setFuncargsSize(int funcargs_size) { diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh index 76f932655..e984af494 100644 --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice // helper functions to retrieve/set GPU attributes int getNumCUs(); + int wfSize() const; void setFuncargsSize(int funcargs_size); }; diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 355018666..a6a4d86db 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) int physVgpr = w->remap(dst, sizeof(c0), 1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " "$%s%d <- %d global ld done (src = wavefront " diff --git 
a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 2f35a983c..1806e79e4 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,11 +42,29 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum) - : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), + m_op(Enums::MO_UNDEF), memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) { - tlbHitLevel.assign(VSZ, -1); + tlbHitLevel.assign(computeUnit()->wfSize(), -1); + d_data = new uint8_t[computeUnit()->wfSize() * 16]; + a_data = new uint8_t[computeUnit()->wfSize() * 8]; + x_data = new uint8_t[computeUnit()->wfSize() * 8]; + for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { + a_data[i] = 0; + x_data[i] = 0; + } + for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) { + d_data[i] = 0; + } +} + +GPUDynInst::~GPUDynInst() +{ + delete[] d_data; + delete[] a_data; + delete[] x_data; } void diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index e44d8f80d..46774d867 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext public: GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum); - + ~GPUDynInst(); void execute(); int numSrcRegOperands(); int numDstRegOperands(); @@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext Enums::StorageClassType executedAs(); // The address of the memory operation - Addr addr[VSZ]; + std::vector<Addr> addr; Addr pAddr; // The data to get written - uint8_t d_data[VSZ * 16]; + uint8_t *d_data; // Additional data (for atomics) - uint8_t a_data[VSZ * 8]; + uint8_t *a_data; // Additional data (for atomics) - uint8_t x_data[VSZ * 8]; + uint8_t *x_data; // The execution 
mask VectorMask exec_mask; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 7f919c5f4..a970d8f9b 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) int physVgpr = w->remap(dst,sizeof(c0),1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { // write the value into the physical VGPR. This is a purely // functional operation. No timing is modeled. diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh index 4f8032832..5ade89789 100644 --- a/src/gpu-compute/misc.hh +++ b/src/gpu-compute/misc.hh @@ -37,28 +37,14 @@ #define __MISC_HH__ #include <bitset> +#include <limits> #include <memory> #include "base/misc.hh" class GPUDynInst; -// wavefront size of the machine -static const int VSZ = 64; - -/* - This check is necessary because std::bitset only provides conversion to - unsigned long or unsigned long long via to_ulong() or to_ullong(). there are - a few places in the code where to_ullong() is used, however if VSZ is larger - than a value the host can support then bitset will throw a runtime exception. - - we should remove all use of to_long() or to_ullong() so we can have VSZ - greater than 64b, however until that is done this assert is required. 
- */ -static_assert(VSZ <= sizeof(unsigned long long) * 8, - "VSZ is larger than the host can support"); - -typedef std::bitset<VSZ> VectorMask; +typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask; typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr; class WaitClass diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh index 092303c00..7bca757b8 100644 --- a/src/gpu-compute/qstruct.hh +++ b/src/gpu-compute/qstruct.hh @@ -100,7 +100,7 @@ struct WFContext { // 32 bit values // barrier state - int bar_cnt[VSZ]; + std::vector<int> bar_cnt; // id (which WF in the WG) int cnt; diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index 8b7dc0691..c43d765af 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) nxtBusy.clear(); nxtBusy.resize(numRegsPerSimd, 0); - vgprState->init(numRegsPerSimd); + vgprState->init(numRegsPerSimd, p->wfSize); } void diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc index f231b0579..e177d3b64 100644 --- a/src/gpu-compute/vector_register_state.cc +++ b/src/gpu-compute/vector_register_state.cc @@ -35,6 +35,8 @@ #include "gpu-compute/vector_register_state.hh" +#include <limits> + #include "gpu-compute/compute_unit.hh" VecRegisterState::VecRegisterState() : computeUnit(nullptr) @@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit) } void -VecRegisterState::init(uint32_t _size) +VecRegisterState::init(uint32_t _size, uint32_t wf_size) { s_reg.resize(_size); + fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits || + wf_size <= 0, + "WF size is larger than the host can support or is zero"); + fatal_if((wf_size & (wf_size - 1)) != 0, + "Wavefront size should be a power of 2"); + for (int i = 0; i < s_reg.size(); ++i) { + s_reg[i].resize(wf_size, 0); + } 
d_reg.resize(_size); + for (int i = 0; i < d_reg.size(); ++i) { + d_reg[i].resize(wf_size, 0); + } } diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh index a233b9acc..97a0d8e25 100644 --- a/src/gpu-compute/vector_register_state.hh +++ b/src/gpu-compute/vector_register_state.hh @@ -51,7 +51,7 @@ class VecRegisterState { public: VecRegisterState(); - void init(uint32_t _size); + void init(uint32_t _size, uint32_t wf_size); const std::string& name() const { return _name; } void setParent(ComputeUnit *_computeUnit); @@ -93,9 +93,9 @@ class VecRegisterState ComputeUnit *computeUnit; std::string _name; // 32-bit Single Precision Vector Register State - std::vector<std::array<uint32_t, VSZ>> s_reg; + std::vector<std::vector<uint32_t>> s_reg; // 64-bit Double Precision Vector Register State - std::vector<std::array<uint64_t, VSZ>> d_reg; + std::vector<std::vector<uint64_t>> d_reg; }; #endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 7cdec53e5..a20330082 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p) last_trace = 0; simdId = p->simdId; wfSlotId = p->wf_slot_id; - status = S_STOPPED; reservedVectorRegs = 0; startVgprIndex = 0; @@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p) mem_trace_busy = 0; old_vgpr_tcnt = 0xffffffffffffffffll; old_dgpr_tcnt = 0xffffffffffffffffll; + old_vgpr.resize(p->wfSize); pendingFetch = false; dropFetch = false; condRegState = new ConditionRegisterState(); maxSpVgprs = 0; maxDpVgprs = 0; + last_addr.resize(p->wfSize); + workitemFlatId.resize(p->wfSize); + old_dgpr.resize(p->wfSize); + bar_cnt.resize(p->wfSize); + for (int i = 0; i < 3; ++i) { + workitemid[i].resize(p->wfSize); + } } void @@ -144,6 +151,7 @@ Wavefront::~Wavefront() { if (callArgMem) delete callArgMem; + delete condRegState; } void diff --git a/src/gpu-compute/wavefront.hh 
b/src/gpu-compute/wavefront.hh index 0abab8e83..5a5386a3d 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -83,6 +83,7 @@ class CallArgMem public: // pointer to buffer for storing function arguments uint8_t *mem; + int wfSize; // size of function args int funcArgsSizePerItem; @@ -90,13 +91,13 @@ class CallArgMem int getLaneOffset(int lane, int addr) { - return addr * VSZ + sizeof(CType) * lane; + return addr * wfSize + sizeof(CType) * lane; } - CallArgMem(int func_args_size_per_item) - : funcArgsSizePerItem(func_args_size_per_item) + CallArgMem(int func_args_size_per_item, int wf_size) + : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) { - mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); } ~CallArgMem() @@ -192,9 +193,9 @@ class Wavefront : public SimObject bool isOldestInstALU(); bool isOldestInstBarrier(); // used for passing spill address to DDInstGPU - uint64_t last_addr[VSZ]; - uint32_t workitemid[3][VSZ]; - uint32_t workitemFlatId[VSZ]; + std::vector<Addr> last_addr; + std::vector<uint32_t> workitemid[3]; + std::vector<uint32_t> workitemFlatId; uint32_t workgroupid[3]; uint32_t workgroupsz[3]; uint32_t gridsz[3]; @@ -230,14 +231,14 @@ class Wavefront : public SimObject uint32_t startVgprIndex; // Old value of destination gpr (for trace) - uint32_t old_vgpr[VSZ]; + std::vector<uint32_t> old_vgpr; // Id of destination gpr (for trace) uint32_t old_vgpr_id; // Tick count of last old_vgpr copy uint64_t old_vgpr_tcnt; // Old value of destination gpr (for trace) - uint64_t old_dgpr[VSZ]; + std::vector<uint64_t> old_dgpr; // Id of destination gpr (for trace) uint32_t old_dgpr_id; // Tick count of last old_vgpr copy @@ -247,7 +248,7 @@ class Wavefront : public SimObject VectorMask init_mask; // number of barriers this WF has joined - int bar_cnt[VSZ]; + std::vector<int> bar_cnt; int max_bar_cnt; // Flag to stall a wave on barrier bool stalledAtBarrier; @@ -296,9 
+297,9 @@ class Wavefront : public SimObject // argument memory for hsail call instruction CallArgMem *callArgMem; void - initCallArgMem(int func_args_size_per_item) + initCallArgMem(int func_args_size_per_item, int wf_size) { - callArgMem = new CallArgMem(func_args_size_per_item); + callArgMem = new CallArgMem(func_args_size_per_item, wf_size); } template<typename CType> @@ -327,7 +328,6 @@ class Wavefront : public SimObject } void start(uint64_t _wfDynId, uint64_t _base_ptr); - void exec(); void updateResources(); int ready(itype_e type); |