diff options
author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-10-26 22:47:19 -0400 |
---|---|---|
committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-10-26 22:47:19 -0400 |
commit | 00a6346c91f6e09eb9c0f4cf61a7d72932a1282f (patch) | |
tree | 3ca7c5b2dedd8ed8151566b4d5abec95a06f4faf /src/gpu-compute/global_memory_pipeline.cc | |
parent | 7ac38849abaf6aeccf39137bc8acb9e44d192e82 (diff) | |
download | gem5-00a6346c91f6e09eb9c0f4cf61a7d72932a1282f.tar.xz |
hsail, gpu-compute: remove doGm/SmReturn, add completeAcc
We are removing doGmReturn from the GM pipe and adding completeAcc()
implementations for the HSAIL mem ops. The behavior in doGmReturn is
dependent on HSAIL and HSAIL mem ops; however, the completion phase
of memory ops in machine ISA can be very different, even amongst individual
machine ISA mem ops. So we remove this functionality from the pipeline and
allow it to be implemented by the individual instructions.
Diffstat (limited to 'src/gpu-compute/global_memory_pipeline.cc')
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.cc | 153 |
1 file changed, 36 insertions, 117 deletions
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index ab3e8c47e..f48af5a6f 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -65,13 +65,15 @@ GlobalMemPipeline::exec() !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; bool accessVrf = true; + Wavefront *w = nullptr; + // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible if ((m) && (m->isLoad() || m->isAtomicRet())) { - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + w = m->wavefront(); accessVrf = - w->computeUnit->vrf[m->simdId]-> + w->computeUnit->vrf[w->simdId]-> vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); } @@ -82,44 +84,38 @@ GlobalMemPipeline::exec() (computeUnit->shader->coissue_return || computeUnit->wfWait.at(m->pipeId).rdy())) { - if (m->v_type == VT_32 && m->m_type == Enums::M_U8) - doGmReturn<uint32_t, uint8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) - doGmReturn<uint32_t, uint16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) - doGmReturn<uint32_t, uint32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) - doGmReturn<int32_t, int8_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) - doGmReturn<int32_t, int16_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) - doGmReturn<int32_t, int32_t>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) - doGmReturn<float, Float16>(m); - else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) - doGmReturn<float, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) - doGmReturn<uint64_t, uint8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) - doGmReturn<uint64_t, uint16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) - doGmReturn<uint64_t, uint32_t>(m); - else if (m->v_type == VT_64 && m->m_type == 
Enums::M_U64) - doGmReturn<uint64_t, uint64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) - doGmReturn<int64_t, int8_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) - doGmReturn<int64_t, int16_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) - doGmReturn<int64_t, int32_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) - doGmReturn<int64_t, int64_t>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) - doGmReturn<double, Float16>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) - doGmReturn<double, float>(m); - else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) - doGmReturn<double, double>(m); + w = m->wavefront(); + + m->completeAcc(m); + + if (m->isLoad() || m->isAtomic()) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + } else { + assert(m->isStore()); + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + + if (m->isStore() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, + m->time, -1); + } + + if (m->isLoad() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); } // If pipeline has executed a global memory instruction @@ -149,83 +145,6 @@ GlobalMemPipeline::exec() } } -template<typename c0, typename c1> -void -GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) -{ - Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; - - // Return data to registers - if (m->isLoad() || m->isAtomic()) { - gmReturnedLoads.pop(); - assert(inflightLoads > 0); - --inflightLoads; - - if (m->isLoad() || m->isAtomicRet()) { - 
std::vector<uint32_t> regVec; - // iterate over number of destination register operands since - // this is a load or atomic operation - for (int k = 0; k < m->n_reg; ++k) { - assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); - int dst = m->dst_reg + k; - - if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) - dst = m->dst_reg_vec[k]; - // virtual->physical VGPR mapping - int physVgpr = w->remap(dst, sizeof(c0), 1); - // save the physical VGPR index - regVec.push_back(physVgpr); - c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - - for (int i = 0; i < w->computeUnit->wfSize(); ++i) { - if (m->exec_mask[i]) { - DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " - "$%s%d <- %d global ld done (src = wavefront " - "ld inst)\n", w->computeUnit->cu_id, w->simdId, - w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", - dst, *p1); - // write the value into the physical VGPR. This is a - // purely functional operation. No timing is modeled. - w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, - *p1, i); - } - ++p1; - } - } - - // Schedule the write operation of the load data on the VRF. - // This simply models the timing aspect of the VRF write operation. - // It does not modify the physical VGPR. 
- loadVrfBankConflictCycles += - w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), - w, regVec, sizeof(c0), - m->time); - } - } else { - gmReturnedStores.pop(); - assert(inflightStores > 0); - --inflightStores; - } - - // Decrement outstanding register count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - - if (m->isStore() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, - -1); - } - - if (m->isLoad() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, - -1); - } - - // Mark write bus busy for appropriate amount of time - computeUnit->glbMemToVrfBus.set(m->time); - if (!computeUnit->shader->coissue_return) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); -} - void GlobalMemPipeline::regStats() { |