From 00a6346c91f6e09eb9c0f4cf61a7d72932a1282f Mon Sep 17 00:00:00 2001
From: Tony Gutierrez <anthony.gutierrez@amd.com>
Date: Wed, 26 Oct 2016 22:47:19 -0400
Subject: hsail, gpu-compute: remove doGm/SmReturn, add completeAcc

We remove doGmReturn() from the GM pipe and doSmReturn() from the LM pipe,
and add completeAcc() implementations for the HSAIL mem ops. The behavior
in doGmReturn() is specific to HSAIL and its mem ops; in a machine ISA,
however, the completion phase of a memory op can differ considerably, even
among that ISA's individual mem ops. So we remove this functionality from
the pipelines and let the individual instructions implement it.
---
 src/gpu-compute/global_memory_pipeline.cc | 153 +++++++----------------------
 src/gpu-compute/global_memory_pipeline.hh |   8 +-
 src/gpu-compute/gpu_dyn_inst.cc           |   6 ++
 src/gpu-compute/gpu_dyn_inst.hh           |   4 +
 src/gpu-compute/local_memory_pipeline.cc  | 126 ++++++------------------
 src/gpu-compute/local_memory_pipeline.hh  |   8 +-
 6 files changed, 85 insertions(+), 220 deletions(-)

(limited to 'src/gpu-compute')

diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index ab3e8c47e..f48af5a6f 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -65,13 +65,15 @@ GlobalMemPipeline::exec()
         !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
 
     bool accessVrf = true;
+    Wavefront *w = nullptr;
+
     // check the VRF to see if the operands of a load (or load component
     // of an atomic) are accessible
     if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
 
         accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
                 vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
 
@@ -82,44 +84,38 @@ GlobalMemPipeline::exec()
         (computeUnit->shader->coissue_return ||
          computeUnit->wfWait.at(m->pipeId).rdy())) {
 
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
-            doGmReturn<uint32_t, uint8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
-            doGmReturn<uint32_t, uint16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
-            doGmReturn<uint32_t, uint32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
-            doGmReturn<int32_t, int8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
-            doGmReturn<int32_t, int16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
-            doGmReturn<int32_t, int32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
-            doGmReturn<float, Float16>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
-            doGmReturn<float, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
-            doGmReturn<uint64_t, uint8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
-            doGmReturn<uint64_t, uint16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
-            doGmReturn<uint64_t, uint32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
-            doGmReturn<uint64_t, uint64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
-            doGmReturn<int64_t, int8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
-            doGmReturn<int64_t, int16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
-            doGmReturn<int64_t, int32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
-            doGmReturn<int64_t, int64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
-            doGmReturn<double, Float16>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
-            doGmReturn<double, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
-            doGmReturn<double, double>(m);
+        w = m->wavefront();
+
+        m->completeAcc(m);
+
+        if (m->isLoad() || m->isAtomic()) {
+            gmReturnedLoads.pop();
+            assert(inflightLoads > 0);
+            --inflightLoads;
+        } else {
+            assert(m->isStore());
+            gmReturnedStores.pop();
+            assert(inflightStores > 0);
+            --inflightStores;
+        }
+
+        // Decrement outstanding register count
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+        if (m->isStore() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+                                             m->time, -1);
+        }
+
+        if (m->isLoad() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+                                             m->time, -1);
+        }
+
+        // Mark write bus busy for appropriate amount of time
+        computeUnit->glbMemToVrfBus.set(m->time);
+        if (!computeUnit->shader->coissue_return)
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
     }
 
     // If pipeline has executed a global memory instruction
@@ -149,83 +145,6 @@ GlobalMemPipeline::exec()
     }
 }
 
-template <typename c0, typename c1>
-void
-GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
-{
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
-    // Return data to registers
-    if (m->isLoad() || m->isAtomic()) {
-        gmReturnedLoads.pop();
-        assert(inflightLoads > 0);
-        --inflightLoads;
-
-        if (m->isLoad() || m->isAtomicRet()) {
-            std::vector<int> regVec;
-            // iterate over number of destination register operands since
-            // this is a load or atomic operation
-            for (int k = 0; k < m->n_reg; ++k) {
-                assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
-                int dst = m->dst_reg + k;
-
-                if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
-                    dst = m->dst_reg_vec[k];
-                // virtual->physical VGPR mapping
-                int physVgpr = w->remap(dst, sizeof(c0), 1);
-                // save the physical VGPR index
-                regVec.push_back(physVgpr);
-                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
-                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
-                    if (m->exec_mask[i]) {
-                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
-                                "$%s%d <- %d global ld done (src = wavefront "
-                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
-                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
-                                dst, *p1);
-                        // write the value into the physical VGPR. This is a
-                        // purely functional operation. No timing is modeled.
-                        w->computeUnit->vrf[w->simdId]->write(physVgpr,
-                                                              *p1, i);
-                    }
-                    ++p1;
-                }
-            }
-
-            // Schedule the write operation of the load data on the VRF.
-            // This simply models the timing aspect of the VRF write operation.
-            // It does not modify the physical VGPR.
-            loadVrfBankConflictCycles +=
-                w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
-                                                     w, regVec, sizeof(c0),
-                                                     m->time);
-        }
-    } else {
-        gmReturnedStores.pop();
-        assert(inflightStores > 0);
-        --inflightStores;
-    }
-
-    // Decrement outstanding register count
-    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
-    if (m->isStore() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time,
-                                         -1);
-    }
-
-    if (m->isLoad() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time,
-                                         -1);
-    }
-
-    // Mark write bus busy for appropriate amount of time
-    computeUnit->glbMemToVrfBus.set(m->time);
-    if (!computeUnit->shader->coissue_return)
-        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
 void
 GlobalMemPipeline::regStats()
 {
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index ed49f6f6b..368a15079 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -62,8 +62,6 @@ class GlobalMemPipeline
     void init(ComputeUnit *cu);
     void exec();
 
-    template <typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
-
     std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
     std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
     std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
@@ -89,6 +87,12 @@ class GlobalMemPipeline
     const std::string &name() const { return _name; }
     void regStats();
 
+    void
+    incLoadVRFBankConflictCycles(int num_cycles)
+    {
+        loadVrfBankConflictCycles += num_cycles;
+    }
+
 private:
     ComputeUnit *computeUnit;
     std::string _name;
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index ec6340360..7092a7a40 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -155,6 +155,12 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
     time = 0;
 }
 
+void
+GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
+{
+    _staticInst->completeAcc(gpuDynInst);
+}
+
 /**
  * accessor methods for the attributes of
  * the underlying GPU static instruction
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index c07d85d78..527b87b4c 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -258,6 +258,10 @@ class GPUDynInst : public GPUExecContext
     // Initiate the specified memory operation, by creating a
     // memory request and sending it off to the memory system.
     void initiateAcc(GPUDynInstPtr gpuDynInst);
+    // Complete the specified memory operation by writing the returned
+    // value back to the RF in the case of a load or atomic return;
+    // in the case of a store there is nothing to do
+    void completeAcc(GPUDynInstPtr gpuDynInst);
 
     void updateStats();
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 80dad6fcd..9e7dc6fb3 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -62,11 +62,13 @@ LocalMemPipeline::exec()
         lmReturnedRequests.front() : nullptr;
 
     bool accessVrf = true;
+    Wavefront *w = nullptr;
+
     if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
 
         accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
                 vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
 
@@ -74,44 +76,29 @@ LocalMemPipeline::exec()
     if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
         computeUnit->locMemToVrfBus.rdy() &&
         (computeUnit->shader->coissue_return ||
          computeUnit->wfWait.at(m->pipeId).rdy())) {
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
-            doSmReturn<uint32_t, uint8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
-            doSmReturn<uint32_t, uint16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
-            doSmReturn<uint32_t, uint32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
-            doSmReturn<int32_t, int8_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
-            doSmReturn<int32_t, int16_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
-            doSmReturn<int32_t, int32_t>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
-            doSmReturn<float, Float16>(m);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
-            doSmReturn<float, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
-            doSmReturn<uint64_t, uint8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
-            doSmReturn<uint64_t, uint16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
-            doSmReturn<uint64_t, uint32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
-            doSmReturn<uint64_t, uint64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
-            doSmReturn<int64_t, int8_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
-            doSmReturn<int64_t, int16_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
-            doSmReturn<int64_t, int32_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
-            doSmReturn<int64_t, int64_t>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
-            doSmReturn<double, Float16>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
-            doSmReturn<double, float>(m);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
-            doSmReturn<double, double>(m);
+
+        lmReturnedRequests.pop();
+        w = m->wavefront();
+
+        m->completeAcc(m);
+
+        // Decrement outstanding request count
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+        if (m->isStore() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+                                             m->time, -1);
+        }
+
+        if (m->isLoad() || m->isAtomic()) {
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+                                             m->time, -1);
+        }
+
+        // Mark write bus busy for appropriate amount of time
+        computeUnit->locMemToVrfBus.set(m->time);
+        if (computeUnit->shader->coissue_return == 0)
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
     }
 
     // If pipeline has executed a local memory instruction
@@ -129,65 +116,6 @@ LocalMemPipeline::exec()
     }
 }
 
-template <typename c0, typename c1>
-void
-LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
-{
-    lmReturnedRequests.pop();
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
-    // Return data to registers
-    if (m->isLoad() || m->isAtomicRet()) {
-        std::vector<int> regVec;
-        for (int k = 0; k < m->n_reg; ++k) {
-            int dst = m->dst_reg + k;
-
-            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
-                dst = m->dst_reg_vec[k];
-            // virtual->physical VGPR mapping
-            int physVgpr = w->remap(dst, sizeof(c0), 1);
-            // save the physical VGPR index
-            regVec.push_back(physVgpr);
-            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
-            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
-                if (m->exec_mask[i]) {
-                    // write the value into the physical VGPR. This is a purely
-                    // functional operation. No timing is modeled.
-                    w->computeUnit->vrf[w->simdId]->write(physVgpr,
-                                                          *p1, i);
-                }
-                ++p1;
-            }
-        }
-
-        // Schedule the write operation of the load data on the VRF. This
-        // simply models the timing aspect of the VRF write operation. It
-        // does not modify the physical VGPR.
-        loadVrfBankConflictCycles +=
-            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
-                                                 regVec, sizeof(c0), m->time);
-    }
-
-    // Decrement outstanding request count
-    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
-    if (m->isStore() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
-                                         m->time, -1);
-    }
-
-    if (m->isLoad() || m->isAtomic()) {
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
-                                         m->time, -1);
-    }
-
-    // Mark write bus busy for appropriate amount of time
-    computeUnit->locMemToVrfBus.set(m->time);
-    if (computeUnit->shader->coissue_return == 0)
-        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
 void
 LocalMemPipeline::regStats()
 {
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh
index a63d867d0..e0a21fd82 100644
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -61,8 +61,6 @@ class LocalMemPipeline
     void init(ComputeUnit *cu);
    void exec();
 
-    template <typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
-
     std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
     std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
 
@@ -81,6 +79,12 @@ class LocalMemPipeline
     const std::string& name() const { return _name; }
     void regStats();
 
+    void
+    incLoadVRFBankConflictCycles(int num_cycles)
+    {
+        loadVrfBankConflictCycles += num_cycles;
+    }
+
 private:
     ComputeUnit *computeUnit;
     std::string _name;
-- 
cgit v1.2.3
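
The HSAIL completeAcc() implementations themselves fall outside this view (it is limited to 'src/gpu-compute'), but the removed doGmReturn()/doSmReturn() bodies show what a load's implementation has to do: remap each virtual destination VGPR to a physical register, functionally write the returned data back under the exec mask, and schedule the timed VRF write. A minimal sketch follows, reconstructed from the removed code; the fields it uses (n_reg, dst_reg, dst_reg_vec, d_data, exec_mask) appear above, while the class name LdInst and the ComputeUnit member globalMemoryPipe are illustrative assumptions, not names confirmed by this patch.

// Sketch only. RegCType/MemCType mirror the old c0/c1 template parameters;
// LdInst and globalMemoryPipe are assumed names, not part of this patch.
template<typename RegCType, typename MemCType>
void
LdInst<RegCType, MemCType>::completeAcc(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();
    std::vector<int> regVec;

    for (int k = 0; k < gpuDynInst->n_reg; ++k) {
        int dst = gpuDynInst->dst_reg + k;
        if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
            dst = gpuDynInst->dst_reg_vec[k];
        // virtual->physical VGPR mapping, as the old pipeline code did it
        int physVgpr = w->remap(dst, sizeof(RegCType), 1);
        regVec.push_back(physVgpr);
        MemCType *p =
            &((MemCType *)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

        for (int i = 0; i < w->computeUnit->wfSize(); ++i, ++p) {
            if (gpuDynInst->exec_mask[i]) {
                // purely functional write of the returned data; no timing
                w->computeUnit->vrf[w->simdId]->write(physVgpr, *p, i);
            }
        }
    }

    // model the timing of the VRF writeback; exec() returns the number of
    // bank-conflict cycles, which doGmReturn() used to accumulate directly
    int conflictCycles =
        w->computeUnit->vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                             sizeof(RegCType),
                                             gpuDynInst->time);
    w->computeUnit->globalMemoryPipe
        .incLoadVRFBankConflictCycles(conflictCycles);
}

The incLoadVRFBankConflictCycles() accessor added to both pipelines exists for exactly this last step: the bank-conflict accounting previously happened inside doGm/SmReturn(), and per-instruction completeAcc() implementations need a way to report it back to the pipeline that owns the stat.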