-rw-r--r--   src/arch/hsail/insts/mem.hh                | 140
-rw-r--r--   src/gpu-compute/global_memory_pipeline.cc  | 153
-rw-r--r--   src/gpu-compute/global_memory_pipeline.hh  |   8
-rw-r--r--   src/gpu-compute/gpu_dyn_inst.cc            |   6
-rw-r--r--   src/gpu-compute/gpu_dyn_inst.hh            |   4
-rw-r--r--   src/gpu-compute/local_memory_pipeline.cc   | 126
-rw-r--r--   src/gpu-compute/local_memory_pipeline.hh   |   8
7 files changed, 225 insertions, 220 deletions
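
Taken together, the patch moves the register-writeback logic for returning memory accesses out of the two memory pipelines and into a virtual completeAcc() hook on the instruction. A toy sketch of the pattern (illustrative names only, not the actual gem5 classes):

#include <memory>

struct DynInst;
using DynInstPtr = std::shared_ptr<DynInst>;

struct StaticInst
{
    virtual ~StaticInst() = default;
    // each instruction class knows its own destination type, so it can do
    // the register writeback without the pipeline inferring the types for it
    virtual void completeAcc(DynInstPtr m) = 0;
};

struct DynInst
{
    StaticInst *staticInst = nullptr;
    void completeAcc(DynInstPtr self) { staticInst->completeAcc(self); }
};

// pipeline return path, before vs. after (condensed):
//   before:  a 19-way if-else over (m->v_type, m->m_type) calling
//            doGmReturn<c0, c1>(m) / doSmReturn<c0, c1>(m)
//   after:   m->completeAcc(m);
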
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
index e223c7cf5..2e7dfcd1c 100644
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@@ -36,9 +36,12 @@
 #ifndef __ARCH_HSAIL_INSTS_MEM_HH__
 #define __ARCH_HSAIL_INSTS_MEM_HH__
 
+#include <type_traits>
+
 #include "arch/hsail/insts/decl.hh"
 #include "arch/hsail/insts/gpu_static_inst.hh"
 #include "arch/hsail/operand.hh"
+#include "gpu-compute/compute_unit.hh"
 
 namespace HsailISA
 {
@@ -491,6 +494,86 @@ namespace HsailISA
             gpuDynInst->updateStats();
         }
 
+        void
+        completeAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c1;
+
+            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;
+
+            /**
+             * this code essentially replaces the long if-else chain
+             * that was used in GlobalMemPipeline::exec() to infer the
+             * size (single/double) and type (floating point/integer) of
+             * the destination register. this is needed for load
+             * instructions because the loaded value and the
+             * destination type can be of different sizes, and we also
+             * need to know if the value we're writing back is floating
+             * point and signed/unsigned, so we can properly cast the
+             * writeback value
+             */
+            typedef typename std::conditional<is_vt_32,
+                typename std::conditional<std::is_floating_point<c1>::value,
+                    float, typename std::conditional<std::is_signed<c1>::value,
+                    int32_t, uint32_t>::type>::type,
+                typename std::conditional<std::is_floating_point<c1>::value,
+                    double, typename std::conditional<std::is_signed<c1>::value,
+                    int64_t, uint64_t>::type>::type>::type c0;
+
+
+            Wavefront *w = gpuDynInst->wavefront();
+
+            std::vector<uint32_t> regVec;
+            // iterate over number of destination register operands since
+            // this is a load
+            for (int k = 0; k < num_dest_operands; ++k) {
+                assert((sizeof(c1) * num_dest_operands)
+                       <= MAX_WIDTH_FOR_MEM_INST);
+
+                int dst = this->dest.regIndex() + k;
+                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
+                    dst = dest_vect[k].regIndex();
+                // virtual->physical VGPR mapping
+                int physVgpr = w->remap(dst, sizeof(c0), 1);
+                // save the physical VGPR index
+                regVec.push_back(physVgpr);
+
+                c1 *p1 =
+                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];
+
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+                                "$%s%d <- %d global ld done (src = wavefront "
+                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+                                dst, *p1);
+                        // write the value into the physical VGPR. This is a
+                        // purely functional operation. No timing is modeled.
+                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+                            *p1, i);
+                    }
+                    ++p1;
+                }
+            }
+
+            // Schedule the write operation of the load data on the VRF.
+            // This simply models the timing aspect of the VRF write operation.
+            // It does not modify the physical VGPR.
+            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+                sizeof(c0), gpuDynInst->time);
+
+            if (this->isGlobalMem()) {
+                gpuDynInst->computeUnit()->globalMemoryPipe
+                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+            } else {
+                assert(this->isLocalMem());
+                gpuDynInst->computeUnit()->localMemoryPipe
+                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+            }
+        }
+
       private:
         void
         execLdAcq(GPUDynInstPtr gpuDynInst) override
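
The nested std::conditional above selects the writeback type c0 (the type the loaded value is cast to before it is written to the VGPR) from the destination register width and from whether the memory type c1 is floating point, signed, or unsigned. Pulled out into a standalone alias it can be sanity-checked in isolation; this sketch covers only the builtin C types handled by the old if-else chain (the Float16 case is left out):

#include <cstdint>
#include <type_traits>

template<bool IsVt32, typename C1>
using WritebackType = typename std::conditional<IsVt32,
    typename std::conditional<std::is_floating_point<C1>::value, float,
        typename std::conditional<std::is_signed<C1>::value,
                                  int32_t, uint32_t>::type>::type,
    typename std::conditional<std::is_floating_point<C1>::value, double,
        typename std::conditional<std::is_signed<C1>::value,
                                  int64_t, uint64_t>::type>::type>::type;

// a few of the (VT_32/VT_64, memory type) pairs the old chain handled:
static_assert(std::is_same<WritebackType<true,  uint8_t>,  uint32_t>::value, "");
static_assert(std::is_same<WritebackType<true,  int16_t>,  int32_t>::value, "");
static_assert(std::is_same<WritebackType<true,  float>,    float>::value, "");
static_assert(std::is_same<WritebackType<false, uint32_t>, uint64_t>::value, "");
static_assert(std::is_same<WritebackType<false, int64_t>,  int64_t>::value, "");
static_assert(std::is_same<WritebackType<false, double>,   double>::value, "");
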
@@ -941,6 +1024,11 @@ namespace HsailISA
             execSt(gpuDynInst);
         }
 
+        // stores don't write anything back, so there is nothing
+        // to do here. we only override this method to avoid the
+        // fatal in the base class implementation
+        void completeAcc(GPUDynInstPtr gpuDynInst) override { }
+
       private:
         // execSt may be called through a continuation
        // if the store had release semantics. see comment for
@@ -1409,6 +1497,58 @@ namespace HsailISA
         }
 
+        void
+        completeAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // if this is not an atomic return op, then we
+            // have nothing more to do.
+            if (this->isAtomicRet()) {
+                // the size of the src operands and the
+                // memory being operated on must match
+                // for HSAIL atomics - this assumption may
+                // not apply to all ISAs
+                typedef typename MemDataType::CType CType;
+
+                Wavefront *w = gpuDynInst->wavefront();
+                int dst = this->dest.regIndex();
+                std::vector<uint32_t> regVec;
+                // virtual->physical VGPR mapping
+                int physVgpr = w->remap(dst, sizeof(CType), 1);
+                regVec.push_back(physVgpr);
+                CType *p1 = &((CType*)gpuDynInst->d_data)[0];
+
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+                                "$%s%d <- %d global ld done (src = wavefront "
+                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
+                                dst, *p1);
+                        // write the value into the physical VGPR. This is a
+                        // purely functional operation. No timing is modeled.
+                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
+                    }
+                    ++p1;
+                }
+
+                // Schedule the write operation of the load data on the VRF.
+                // This simply models the timing aspect of the VRF write operation.
+                // It does not modify the physical VGPR.
+                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+                    sizeof(CType), gpuDynInst->time);
+
+                if (this->isGlobalMem()) {
+                    gpuDynInst->computeUnit()->globalMemoryPipe
+                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+                } else {
+                    assert(this->isLocalMem());
+                    gpuDynInst->computeUnit()->localMemoryPipe
+                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+                }
+            }
+        }
+
         void execute(GPUDynInstPtr gpuDynInst) override;
 
       private:
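
Each memory instruction flavour now overrides completeAcc() for a different reason: loads cast and write every active lane back to the VRF, stores have nothing to write back but still override to avoid the fatal in the base class, and atomics only write back when the operation returns a value (isAtomicRet()). A minimal sketch of that structure, with hypothetical class names rather than gem5's real hierarchy:

#include <cstdio>
#include <cstdlib>

struct InstSketch
{
    virtual ~InstSketch() = default;
    // the default mirrors the base-class behaviour referenced in the
    // comment above: reaching it is treated as a programming error
    virtual void completeAcc()
    {
        std::fprintf(stderr, "fatal: completeAcc not implemented\n");
        std::abort();
    }
};

struct LdSketch : InstSketch
{
    void completeAcc() override
    {
        // cast each returned element to the destination type and write it
        // to the lane's physical VGPR (functional write, no timing)
    }
};

struct StSketch : InstSketch
{
    // nothing to write back; the override exists only to avoid the fatal
    void completeAcc() override { }
};

struct AtomicSketch : InstSketch
{
    bool atomicRet = true;   // return vs. no-return atomic
    void completeAcc() override
    {
        if (atomicRet) {
            // write the single returned (old) value back to the VGPR
        }
    }
};
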
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index ab3e8c47e..f48af5a6f 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -65,13 +65,15 @@ GlobalMemPipeline::exec()
         !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
 
     bool accessVrf = true;
+    Wavefront *w = nullptr;
+
     // check the VRF to see if the operands of a load (or load component
     // of an atomic) are accessible
     if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
 
         accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
             vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
 
@@ -82,44 +84,38 @@ GlobalMemPipeline::exec()
         (computeUnit->shader->coissue_return ||
          computeUnit->wfWait.at(m->pipeId).rdy())) {
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
-            doGmReturn<uint32_t, uint8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
-            doGmReturn<uint32_t, uint16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
-            doGmReturn<uint32_t, uint32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
-            doGmReturn<int32_t, int8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
-            doGmReturn<int32_t, int16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
-            doGmReturn<int32_t, int32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
-            doGmReturn<float, Float16>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
-            doGmReturn<float, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
-            doGmReturn<uint64_t, uint8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
-            doGmReturn<uint64_t, uint16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
-            doGmReturn<uint64_t, uint32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
-            doGmReturn<uint64_t, uint64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
-            doGmReturn<int64_t, int8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
-            doGmReturn<int64_t, int16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
-            doGmReturn<int64_t, int32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
-            doGmReturn<int64_t, int64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
-            doGmReturn<double, Float16>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
-            doGmReturn<double, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
-            doGmReturn<double, double>(m);
+        w = m->wavefront();
+
+        m->completeAcc(m);
+
+        if (m->isLoad() || m->isAtomic()) {
+            gmReturnedLoads.pop();
+            assert(inflightLoads > 0);
+            --inflightLoads;
+        } else {
+            assert(m->isStore());
+            gmReturnedStores.pop();
+            assert(inflightStores > 0);
+            --inflightStores;
+        }
+
+        // Decrement outstanding register count
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+        if (m->isStore() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+                                             m->time, -1);
+        }
+
+        if (m->isLoad() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+                                             m->time, -1);
+        }
+
+        // Mark write bus busy for appropriate amount of time
+        computeUnit->glbMemToVrfBus.set(m->time);
+        if (!computeUnit->shader->coissue_return)
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
     }
 
     // If pipeline has executed a global memory instruction
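
Note that the bookkeeping after completeAcc() never decrements the wavefront counters directly: each update goes through Shader::ScheduleAdd() together with the instruction's completion time, so the outstanding-request counts change when the access is modelled to complete rather than when exec() happens to run. A rough sketch of that deferred-update idiom (an illustration of the idea only, not the Shader implementation):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Apply "+delta to *counter" only once simulated time reaches `when`.
struct DeferredAdds
{
    std::map<uint64_t, std::vector<std::pair<int*, int>>> pending;

    void scheduleAdd(int *counter, uint64_t when, int delta)
    {
        pending[when].emplace_back(counter, delta);
    }

    void advanceTo(uint64_t now)
    {
        while (!pending.empty() && pending.begin()->first <= now) {
            for (auto &p : pending.begin()->second)
                *p.first += p.second;
            pending.erase(pending.begin());
        }
    }
};

// usage analogous to the calls above (names are placeholders):
//     sched.scheduleAdd(&outstandingReqs,     completionTime, -1);
//     sched.scheduleAdd(&outstandingReqsRdGm, completionTime, -1);
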
@@ -149,83 +145,6 @@ GlobalMemPipeline::exec()
     }
 }
 
-template<typename c0, typename c1>
-void
-GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
-{
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
-    // Return data to registers
-    if (m->isLoad() || m->isAtomic()) {
-        gmReturnedLoads.pop();
-        assert(inflightLoads > 0);
-        --inflightLoads;
-
-        if (m->isLoad() || m->isAtomicRet()) {
-            std::vector<uint32_t> regVec;
-            // iterate over number of destination register operands since
-            // this is a load or atomic operation
-            for (int k = 0; k < m->n_reg; ++k) {
-                assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
-                int dst = m->dst_reg + k;
-
-                if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
-                    dst = m->dst_reg_vec[k];
-                // virtual->physical VGPR mapping
-                int physVgpr = w->remap(dst, sizeof(c0), 1);
-                // save the physical VGPR index
-                regVec.push_back(physVgpr);
-                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
-                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
-                    if (m->exec_mask[i]) {
-                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
-                                "$%s%d <- %d global ld done (src = wavefront "
-                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
-                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
-                                dst, *p1);
-                        // write the value into the physical VGPR. This is a
-                        // purely functional operation. No timing is modeled.
-                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
-                            *p1, i);
-                    }
-                    ++p1;
-                }
-            }
-
-            // Schedule the write operation of the load data on the VRF.
-            // This simply models the timing aspect of the VRF write operation.
-            // It does not modify the physical VGPR.
-            loadVrfBankConflictCycles +=
-                w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
-                                                     w, regVec, sizeof(c0),
-                                                     m->time);
-        }
-    } else {
-        gmReturnedStores.pop();
-        assert(inflightStores > 0);
-        --inflightStores;
-    }
-
-    // Decrement outstanding register count
-    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
-    if (m->isStore() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time,
-                                         -1);
-    }
-
-    if (m->isLoad() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time,
-                                         -1);
-    }
-
-    // Mark write bus busy for appropriate amount of time
-    computeUnit->glbMemToVrfBus.set(m->time);
-    if (!computeUnit->shader->coissue_return)
-        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
 void
 GlobalMemPipeline::regStats()
 {
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index ed49f6f6b..368a15079 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -62,8 +62,6 @@ class GlobalMemPipeline
     void init(ComputeUnit *cu);
     void exec();
 
-    template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
-
     std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
     std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
     std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
@@ -89,6 +87,12 @@ class GlobalMemPipeline
     const std::string &name() const { return _name; }
     void regStats();
 
+    void
+    incLoadVRFBankConflictCycles(int num_cycles)
+    {
+        loadVrfBankConflictCycles += num_cycles;
+    }
+
   private:
     ComputeUnit *computeUnit;
     std::string _name;
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index ec6340360..7092a7a40 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -155,6 +155,12 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
     time = 0;
 }
 
+void
+GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
+{
+    _staticInst->completeAcc(gpuDynInst);
+}
+
 /**
  * accessor methods for the attributes of
  * the underlying GPU static instruction
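
With doGmReturn()/doSmReturn() gone, the pipelines no longer perform the VRF write themselves, so they cannot accumulate loadVrfBankConflictCycles on their own. Instead, completeAcc() receives the conflict-cycle count from the VRF exec() call and pushes it back into whichever pipeline owns the access through the incLoadVRFBankConflictCycles() helper added above. A condensed sketch of that flow, with simplified signatures rather than the gem5 ones:

struct MemPipeSketch
{
    long loadVrfBankConflictCycles = 0;
    void incLoadVRFBankConflictCycles(int n) { loadVrfBankConflictCycles += n; }
};

// instruction side, at the end of completeAcc() (condensed): the same
// instruction class serves both the global and the local (group) segment,
// so the cycles are routed by memory segment
void accountVrfConflicts(bool isGlobalMem, int conflictCycles,
                         MemPipeSketch &globalMemoryPipe,
                         MemPipeSketch &localMemoryPipe)
{
    if (isGlobalMem)
        globalMemoryPipe.incLoadVRFBankConflictCycles(conflictCycles);
    else
        localMemoryPipe.incLoadVRFBankConflictCycles(conflictCycles);
}
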
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index c07d85d78..527b87b4c 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -258,6 +258,10 @@ class GPUDynInst : public GPUExecContext
     // Initiate the specified memory operation, by creating a
     // memory request and sending it off to the memory system.
     void initiateAcc(GPUDynInstPtr gpuDynInst);
+    // Complete the specified memory operation, by writing the
+    // value back to the RF in the case of a load or atomic
+    // return; in the case of a store, there is nothing to do
+    void completeAcc(GPUDynInstPtr gpuDynInst);
 
     void updateStats();
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 80dad6fcd..9e7dc6fb3 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -62,11 +62,13 @@ LocalMemPipeline::exec()
         lmReturnedRequests.front() : nullptr;
 
     bool accessVrf = true;
+    Wavefront *w = nullptr;
+
     if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
 
         accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
             vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
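
Both pipelines gate the return path on the same kind of condition: a response at the head of the FIFO whose latency has elapsed, a VRF that can accept the write (accessVrf), a free memory-to-VRF bus, and either coissue of returns or a ready wfWait slot. Condensed into a single predicate (a sketch; the booleans stand in for the rdy() checks in the code above):

bool canRetireReturn(bool respAtHead, bool latencyRdy, bool accessVrf,
                     bool memToVrfBusRdy, bool coissueReturn, bool wfWaitRdy)
{
    return respAtHead && latencyRdy && accessVrf && memToVrfBusRdy &&
           (coissueReturn || wfWaitRdy);
}
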
@@ -74,44 +76,29 @@ LocalMemPipeline::exec()
     if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
         computeUnit->locMemToVrfBus.rdy() &&
         (computeUnit->shader->coissue_return ||
          computeUnit->wfWait.at(m->pipeId).rdy())) {
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
-            doSmReturn<uint32_t, uint8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
-            doSmReturn<uint32_t, uint16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
-            doSmReturn<uint32_t, uint32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
-            doSmReturn<int32_t, int8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
-            doSmReturn<int32_t, int16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
-            doSmReturn<int32_t, int32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
-            doSmReturn<float, Float16>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
-            doSmReturn<float, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
-            doSmReturn<uint64_t, uint8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
-            doSmReturn<uint64_t, uint16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
-            doSmReturn<uint64_t, uint32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
-            doSmReturn<uint64_t, uint64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
-            doSmReturn<int64_t, int8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
-            doSmReturn<int64_t, int16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
-            doSmReturn<int64_t, int32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
-            doSmReturn<int64_t, int64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
-            doSmReturn<double, Float16>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
-            doSmReturn<double, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
-            doSmReturn<double, double>(m);
+
+        lmReturnedRequests.pop();
+        w = m->wavefront();
+
+        m->completeAcc(m);
+
+        // Decrement outstanding request count
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+        if (m->isStore() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+                                             m->time, -1);
+        }
+
+        if (m->isLoad() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+                                             m->time, -1);
+        }
+
+        // Mark write bus busy for appropriate amount of time
+        computeUnit->locMemToVrfBus.set(m->time);
+        if (computeUnit->shader->coissue_return == 0)
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
     }
 
     // If pipeline has executed a local memory instruction
@@ -129,65 +116,6 @@ LocalMemPipeline::exec()
     }
 }
 
-template<typename c0, typename c1>
-void
-LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
-{
-    lmReturnedRequests.pop();
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
-    // Return data to registers
-    if (m->isLoad() || m->isAtomicRet()) {
-        std::vector<uint32_t> regVec;
-        for (int k = 0; k < m->n_reg; ++k) {
-            int dst = m->dst_reg+k;
-
-            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
-                dst = m->dst_reg_vec[k];
-            // virtual->physical VGPR mapping
-            int physVgpr = w->remap(dst,sizeof(c0),1);
-            // save the physical VGPR index
-            regVec.push_back(physVgpr);
-            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
-            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
-                if (m->exec_mask[i]) {
-                    // write the value into the physical VGPR. This is a purely
-                    // functional operation. No timing is modeled.
-                    w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
-                        *p1, i);
-                }
-                ++p1;
-            }
-        }
-
-        // Schedule the write operation of the load data on the VRF. This simply
-        // models the timing aspect of the VRF write operation. It does not
-        // modify the physical VGPR.
-        loadVrfBankConflictCycles +=
-            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
-                                                 regVec, sizeof(c0), m->time);
-    }
-
-    // Decrement outstanding request count
-    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
-    if (m->isStore() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
-                                         m->time, -1);
-    }
-
-    if (m->isLoad() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
-                                         m->time, -1);
-    }
-
-    // Mark write bus busy for appropriate amount of time
-    computeUnit->locMemToVrfBus.set(m->time);
-    if (computeUnit->shader->coissue_return == 0)
-        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
 void
 LocalMemPipeline::regStats()
 {
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh
index a63d867d0..e0a21fd82 100644
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -61,8 +61,6 @@ class LocalMemPipeline
     void init(ComputeUnit *cu);
     void exec();
 
-    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
-
    std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
 
@@ -81,6 +79,12 @@ class LocalMemPipeline
     const std::string& name() const { return _name; }
     void regStats();
 
+    void
+    incLoadVRFBankConflictCycles(int num_cycles)
+    {
+        loadVrfBankConflictCycles += num_cycles;
+    }
+
   private:
     ComputeUnit *computeUnit;
     std::string _name;