hsail, gpu-compute: remove doGm/SmReturn add completeAcc

we are removing doGmReturn from the GM pipe, and adding completeAcc() implementations for the HSAIL mem ops. the behavior in doGmReturn is dependent on HSAIL and HSAIL mem ops, however the completion phase of memory ops in machine ISA can be very different, even amongst individual machine ISA mem ops. so we remove this functionality from the pipeline and allow it to be implemented by the individual instructions.
author: Tony Gutierrez <anthony.gutierrez@amd.com> 2016-10-26 22:47:19 -0400
committer: Tony Gutierrez <anthony.gutierrez@amd.com> 2016-10-26 22:47:19 -0400
commit: 00a6346c91f6e09eb9c0f4cf61a7d72932a1282f (patch)
tree: 3ca7c5b2dedd8ed8151566b4d5abec95a06f4faf /src/gpu-compute/local_memory_pipeline.cc
parent: 7ac38849abaf6aeccf39137bc8acb9e44d192e82 (diff)
download: gem5-00a6346c91f6e09eb9c0f4cf61a7d72932a1282f.tar.xz
1 files changed, 27 insertions, 99 deletions
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 80dad6fcd..9e7dc6fb3 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -62,11 +62,13 @@ LocalMemPipeline::exec()
         lmReturnedRequests.front() : nullptr;
 
     bool accessVrf = true;
+    Wavefront *w = nullptr;
+
     if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
 
         accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
             vrfOperandAccessReady(m->seqNum(), w, m,
                                   VrfAccessType::WRITE);
     }
@@ -74,44 +76,29 @@ LocalMemPipeline::exec()
     if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
         computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
                  || computeUnit->wfWait.at(m->pipeId).rdy())) {
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
-            doSmReturn<uint32_t, uint8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
-            doSmReturn<uint32_t, uint16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
-            doSmReturn<uint32_t, uint32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
-            doSmReturn<int32_t, int8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
-            doSmReturn<int32_t, int16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
-            doSmReturn<int32_t, int32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
-            doSmReturn<float, Float16>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
-            doSmReturn<float, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
-            doSmReturn<uint64_t, uint8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
-            doSmReturn<uint64_t, uint16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
-            doSmReturn<uint64_t, uint32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
-            doSmReturn<uint64_t, uint64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
-            doSmReturn<int64_t, int8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
-            doSmReturn<int64_t, int16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
-            doSmReturn<int64_t, int32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
-            doSmReturn<int64_t, int64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
-            doSmReturn<double, Float16>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
-            doSmReturn<double, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
-            doSmReturn<double, double>(m);
+
+        lmReturnedRequests.pop();
+        w = m->wavefront();
+
+        m->completeAcc(m);
+
+        // Decrement outstanding request count
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+        if (m->isStore() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+                                             m->time, -1);
+        }
+
+        if (m->isLoad() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+                                             m->time, -1);
+        }
+
+        // Mark write bus busy for appropriate amount of time
+        computeUnit->locMemToVrfBus.set(m->time);
+        if (computeUnit->shader->coissue_return == 0)
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
     }
 
     // If pipeline has executed a local memory instruction
@@ -129,65 +116,6 @@ LocalMemPipeline::exec()
     }
 }
 
-template<typename c0, typename c1>
-void
-LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
-{
-    lmReturnedRequests.pop();
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
-    // Return data to registers
-    if (m->isLoad() || m->isAtomicRet()) {
-        std::vector<uint32_t> regVec;
-        for (int k = 0; k < m->n_reg; ++k) {
-            int dst = m->dst_reg+k;
-
-            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
-                dst = m->dst_reg_vec[k];
-            // virtual->physical VGPR mapping
-            int physVgpr = w->remap(dst,sizeof(c0),1);
-            // save the physical VGPR index
-            regVec.push_back(physVgpr);
-            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
-            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
-                if (m->exec_mask[i]) {
-                    // write the value into the physical VGPR. This is a purely
-                    // functional operation. No timing is modeled.
-                    w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
-                                                                *p1, i);
-                }
-                ++p1;
-            }
-        }
-
-        // Schedule the write operation of the load data on the VRF. This simply
-        // models the timing aspect of the VRF write operation. It does not
-        // modify the physical VGPR.
-        loadVrfBankConflictCycles +=
-            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
-                                                 regVec, sizeof(c0), m->time);
-    }
-
-    // Decrement outstanding request count
-    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
-    if (m->isStore() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
-                                         m->time, -1);
-    }
-
-    if (m->isLoad() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
-                                         m->time, -1);
-    }
-
-    // Mark write bus busy for appropriate amount of time
-    computeUnit->locMemToVrfBus.set(m->time);
-    if (computeUnit->shader->coissue_return == 0)
-        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
 void
 LocalMemPipeline::regStats()
 {
author	Tony Gutierrez <anthony.gutierrez@amd.com>	2016-10-26 22:47:19 -0400
committer	Tony Gutierrez <anthony.gutierrez@amd.com>	2016-10-26 22:47:19 -0400
commit	00a6346c91f6e09eb9c0f4cf61a7d72932a1282f (patch)
tree	3ca7c5b2dedd8ed8151566b4d5abec95a06f4faf /src/gpu-compute/local_memory_pipeline.cc
parent	7ac38849abaf6aeccf39137bc8acb9e44d192e82 (diff)
download	gem5-00a6346c91f6e09eb9c0f4cf61a7d72932a1282f.tar.xz