diff options
Diffstat (limited to 'src/gpu-compute')
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.cc | 153 | ||||
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.hh | 8 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.cc | 6 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.hh | 4 | ||||
-rw-r--r-- | src/gpu-compute/local_memory_pipeline.cc | 126 | ||||
-rw-r--r-- | src/gpu-compute/local_memory_pipeline.hh | 8 |
6 files changed, 85 insertions, 220 deletions
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index ab3e8c47e..f48af5a6f 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -65,13 +65,15 @@ GlobalMemPipeline::exec() !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; bool accessVrf = true; + Wavefront *w = nullptr; + // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible if ((m) && (m->isLoad() || m->isAtomicRet())) { - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + w = m->wavefront(); accessVrf = - w->computeUnit->vrf[m->simdId]-> + w->computeUnit->vrf[w->simdId]-> vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); } @@ -82,44 +84,38 @@ GlobalMemPipeline::exec() (computeUnit->shader->coissue_return || computeUnit->wfWait.at(m->pipeId).rdy())) { - if (m->v_type == VT_32 && m->m_type == Enums::M_U8) - doGmReturn<uint32_t, uint8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) - doGmReturn<uint32_t, uint16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) - doGmReturn<uint32_t, uint32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) - doGmReturn<int32_t, int8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) - doGmReturn<int32_t, int16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) - doGmReturn<int32_t, int32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) - doGmReturn<float, Float16>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) - doGmReturn<float, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) - doGmReturn<uint64_t, uint8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) - doGmReturn<uint64_t, uint16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) - doGmReturn<uint64_t, uint32_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) - doGmReturn<uint64_t, uint64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) - doGmReturn<int64_t, int8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) - doGmReturn<int64_t, int16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) - doGmReturn<int64_t, int32_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) - doGmReturn<int64_t, int64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) - doGmReturn<double, Float16>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) - doGmReturn<double, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) - doGmReturn<double, double>(m); + w = m->wavefront(); + + m->completeAcc(m); + + if (m->isLoad() || m->isAtomic()) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + } else { + assert(m->isStore()); + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + + if (m->isStore() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, + m->time, -1); + } + + if (m->isLoad() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); } // If pipeline has executed a global memory instruction @@ -149,83 +145,6 @@ GlobalMemPipeline::exec() } } -template<typename c0, typename c1> -void -GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) -{ - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; - - // Return data to registers - if (m->isLoad() || m->isAtomic()) { - gmReturnedLoads.pop(); - assert(inflightLoads > 0); - --inflightLoads; - - if (m->isLoad() || m->isAtomicRet()) { - std::vector<uint32_t> regVec; - // iterate over number of destination register operands since - // this is a load or atomic operation - for (int k = 0; k < m->n_reg; ++k) { - assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); - int dst = m->dst_reg + k; - - if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) - dst = m->dst_reg_vec[k]; - // virtual->physical VGPR mapping - int physVgpr = w->remap(dst, sizeof(c0), 1); - // save the physical VGPR index - regVec.push_back(physVgpr); - c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - - for (int i = 0; i < w->computeUnit->wfSize(); ++i) { - if (m->exec_mask[i]) { - DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " - "$%s%d <- %d global ld done (src = wavefront " - "ld inst)\n", w->computeUnit->cu_id, w->simdId, - w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", - dst, *p1); - // write the value into the physical VGPR. This is a - // purely functional operation. No timing is modeled. - w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, - *p1, i); - } - ++p1; - } - } - - // Schedule the write operation of the load data on the VRF. - // This simply models the timing aspect of the VRF write operation. - // It does not modify the physical VGPR. - loadVrfBankConflictCycles += - w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), - w, regVec, sizeof(c0), - m->time); - } - } else { - gmReturnedStores.pop(); - assert(inflightStores > 0); - --inflightStores; - } - - // Decrement outstanding register count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - - if (m->isStore() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, - -1); - } - - if (m->isLoad() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, - -1); - } - - // Mark write bus busy for appropriate amount of time - computeUnit->glbMemToVrfBus.set(m->time); - if (!computeUnit->shader->coissue_return) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); -} - void GlobalMemPipeline::regStats() { diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh index ed49f6f6b..368a15079 100644 --- a/src/gpu-compute/global_memory_pipeline.hh +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -62,8 +62,6 @@ class GlobalMemPipeline void init(ComputeUnit *cu); void exec(); - template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m); - std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; } std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; } std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; } @@ -89,6 +87,12 @@ class GlobalMemPipeline const std::string &name() const { return _name; } void regStats(); + void + incLoadVRFBankConflictCycles(int num_cycles) + { + loadVrfBankConflictCycles += num_cycles; + } + private: ComputeUnit *computeUnit; std::string _name; diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index ec6340360..7092a7a40 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -155,6 +155,12 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) time = 0; } +void +GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst) +{ + _staticInst->completeAcc(gpuDynInst); +} + /** * accessor methods for the attributes of * the underlying GPU static instruction diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index c07d85d78..527b87b4c 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -258,6 +258,10 @@ class GPUDynInst : public GPUExecContext // Initiate the specified memory operation, by creating a // memory request and sending it off to the memory system. void initiateAcc(GPUDynInstPtr gpuDynInst); + // Complete the specified memory operation, by writing + // value back to the RF in the case of a load or atomic + // return or, in the case of a store, we do nothing + void completeAcc(GPUDynInstPtr gpuDynInst); void updateStats(); diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 80dad6fcd..9e7dc6fb3 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -62,11 +62,13 @@ LocalMemPipeline::exec() lmReturnedRequests.front() : nullptr; bool accessVrf = true; + Wavefront *w = nullptr; + if ((m) && (m->isLoad() || m->isAtomicRet())) { - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + w = m->wavefront(); accessVrf = - w->computeUnit->vrf[m->simdId]-> + w->computeUnit->vrf[w->simdId]-> vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); } @@ -74,44 +76,29 @@ LocalMemPipeline::exec() if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return || computeUnit->wfWait.at(m->pipeId).rdy())) { - if (m->v_type == VT_32 && m->m_type == Enums::M_U8) - doSmReturn<uint32_t, uint8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) - doSmReturn<uint32_t, uint16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) - doSmReturn<uint32_t, uint32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) - doSmReturn<int32_t, int8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) - doSmReturn<int32_t, int16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) - doSmReturn<int32_t, int32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) - doSmReturn<float, Float16>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) - doSmReturn<float, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) - doSmReturn<uint64_t, uint8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) - doSmReturn<uint64_t, uint16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) - doSmReturn<uint64_t, uint32_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) - doSmReturn<uint64_t, uint64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) - doSmReturn<int64_t, int8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) - doSmReturn<int64_t, int16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) - doSmReturn<int64_t, int32_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) - doSmReturn<int64_t, int64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) - doSmReturn<double, Float16>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) - doSmReturn<double, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) - doSmReturn<double, double>(m); + + lmReturnedRequests.pop(); + w = m->wavefront(); + + m->completeAcc(m); + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + + if (m->isStore() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, + m->time, -1); + } + + if (m->isLoad() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); } // If pipeline has executed a local memory instruction @@ -129,65 +116,6 @@ LocalMemPipeline::exec() } } -template<typename c0, typename c1> -void -LocalMemPipeline::doSmReturn(GPUDynInstPtr m) -{ - lmReturnedRequests.pop(); - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; - - // Return data to registers - if (m->isLoad() || m->isAtomicRet()) { - std::vector<uint32_t> regVec; - for (int k = 0; k < m->n_reg; ++k) { - int dst = m->dst_reg+k; - - if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) - dst = m->dst_reg_vec[k]; - // virtual->physical VGPR mapping - int physVgpr = w->remap(dst,sizeof(c0),1); - // save the physical VGPR index - regVec.push_back(physVgpr); - c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - - for (int i = 0; i < w->computeUnit->wfSize(); ++i) { - if (m->exec_mask[i]) { - // write the value into the physical VGPR. This is a purely - // functional operation. No timing is modeled. - w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, - *p1, i); - } - ++p1; - } - } - - // Schedule the write operation of the load data on the VRF. This simply - // models the timing aspect of the VRF write operation. It does not - // modify the physical VGPR. - loadVrfBankConflictCycles += - w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, - regVec, sizeof(c0), m->time); - } - - // Decrement outstanding request count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - - if (m->isStore() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, - m->time, -1); - } - - if (m->isLoad() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, - m->time, -1); - } - - // Mark write bus busy for appropriate amount of time - computeUnit->locMemToVrfBus.set(m->time); - if (computeUnit->shader->coissue_return == 0) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); -} - void LocalMemPipeline::regStats() { diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh index a63d867d0..e0a21fd82 100644 --- a/src/gpu-compute/local_memory_pipeline.hh +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -61,8 +61,6 @@ class LocalMemPipeline void init(ComputeUnit *cu); void exec(); - template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m); - std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; } std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; } @@ -81,6 +79,12 @@ class LocalMemPipeline const std::string& name() const { return _name; } void regStats(); + void + incLoadVRFBankConflictCycles(int num_cycles) + { + loadVrfBankConflictCycles += num_cycles; + } + private: ComputeUnit *computeUnit; std::string _name; |