hsail, gpu-compute: remove doGm/SmReturn add completeAcc

We are removing doGmReturn from the GM pipe and adding completeAcc() implementations for the HSAIL mem ops. The behavior in doGmReturn depends on HSAIL and the HSAIL mem ops; however, the completion phase of memory ops in a machine ISA can be very different, even among individual machine ISA mem ops. We therefore remove this functionality from the pipeline and allow it to be implemented by the individual instructions.
author: Tony Gutierrez <anthony.gutierrez@amd.com> 2016-10-26 22:47:19 -0400
committer: Tony Gutierrez <anthony.gutierrez@amd.com> 2016-10-26 22:47:19 -0400
commit: 00a6346c91f6e09eb9c0f4cf61a7d72932a1282f (patch)
tree: 3ca7c5b2dedd8ed8151566b4d5abec95a06f4faf /src/arch/hsail/insts/mem.hh
parent: 7ac38849abaf6aeccf39137bc8acb9e44d192e82 (diff)
download: gem5-00a6346c91f6e09eb9c0f4cf61a7d72932a1282f.tar.xz
1 files changed, 140 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
index e223c7cf5..2e7dfcd1c 100644
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@@ -36,9 +36,12 @@
 #ifndef __ARCH_HSAIL_INSTS_MEM_HH__
 #define __ARCH_HSAIL_INSTS_MEM_HH__
 
+#include <type_traits>
+
 #include "arch/hsail/insts/decl.hh"
 #include "arch/hsail/insts/gpu_static_inst.hh"
 #include "arch/hsail/operand.hh"
+#include "gpu-compute/compute_unit.hh"
 
 namespace HsailISA
 {
@@ -491,6 +494,86 @@ namespace HsailISA
             gpuDynInst->updateStats();
         }
 
+        void
+        completeAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c1;
+
+            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;
+
+            /**
+              * c0 is the type written to the destination register. this
+              * replaces the long if-else chain that was used in
+              * GlobalMemPipeline::exec() to infer the size (single/double)
+              * and type (floating point/integer) of the destination
+              * register. this is needed for load instructions because the
+              * loaded value (c1) and the destination type can be of
+              * different sizes, and we also need to know if the value we're
+              * writing back is floating point and signed/unsigned, so we
+              * can properly cast the writeback value.
+              */
+            typedef typename std::conditional<is_vt_32,
+                typename std::conditional<std::is_floating_point<c1>::value,
+                    float, typename std::conditional<std::is_signed<c1>::value,
+                    int32_t, uint32_t>::type>::type,
+                typename std::conditional<std::is_floating_point<c1>::value,
+                    double, typename std::conditional<std::is_signed<c1>::value,
+                    int64_t, uint64_t>::type>::type>::type c0;
+
+
+            Wavefront *w = gpuDynInst->wavefront();
+
+            std::vector<uint32_t> regVec;
+            // a load may have several destination register operands, so
+            // write back each destination operand in turn
+            for (int k = 0; k < num_dest_operands; ++k) {
+                assert((sizeof(c1) * num_dest_operands)
+                       <= MAX_WIDTH_FOR_MEM_INST);
+
+                int dst = this->dest.regIndex() + k;
+                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
+                    dst = dest_vect[k].regIndex();
+                // virtual->physical VGPR mapping
+                int physVgpr = w->remap(dst, sizeof(c0), 1);
+                // save the physical VGPR index for the VRF exec() call below
+                regVec.push_back(physVgpr);
+
+                c1 *p1 =
+                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];
+
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+                                "$%s%d <- %d global ld done (src = wavefront "
+                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+                                dst, *p1);
+                        // write the lane's value into the physical VGPR. This
+                        // is purely functional. No timing is modeled.
+                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+                                                                    *p1, i);
+                    }
+                    ++p1;
+                }
+            }
+
+            // Schedule the write operation of the load data on the VRF.
+            // This simply models the timing aspect of the VRF write operation.
+            // It does not modify the physical VGPR.
+            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+                                     sizeof(c0), gpuDynInst->time);
+
+            if (this->isGlobalMem()) {
+                gpuDynInst->computeUnit()->globalMemoryPipe
+                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+            } else {
+                assert(this->isLocalMem());
+                gpuDynInst->computeUnit()->localMemoryPipe
+                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+            }
+        }
+
       private:
         void
         execLdAcq(GPUDynInstPtr gpuDynInst) override
@@ -941,6 +1024,11 @@ namespace HsailISA
             execSt(gpuDynInst);
         }
 
+        // a store doesn't write anything back, so completing the
+        // access is a no-op. we only override this method to avoid
+        // the fatal in the base class implementation
+        void completeAcc(GPUDynInstPtr gpuDynInst) override { }
+
       private:
         // execSt may be called through a continuation
         // if the store had release semantics. see comment for
@@ -1409,6 +1497,58 @@ namespace HsailISA
 
         }
 
+        void
+        completeAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // only an atomic return op has a destination register
+            // to update; otherwise we have nothing more to do.
+            if (this->isAtomicRet()) {
+                // the size of the src operands and the
+                // memory being operated on must match
+                // for HSAIL atomics - this assumption may
+                // not apply to all ISAs
+                typedef typename MemDataType::CType CType;
+
+                Wavefront *w = gpuDynInst->wavefront();
+                int dst = this->dest.regIndex();
+                std::vector<uint32_t> regVec;
+                // virtual->physical VGPR mapping
+                int physVgpr = w->remap(dst, sizeof(CType), 1);
+                regVec.push_back(physVgpr);
+                CType *p1 = &((CType*)gpuDynInst->d_data)[0];
+
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+                                "$%s%d <- %d global ld done (src = wavefront "
+                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
+                                dst, *p1);
+                        // NOTE(review): trace text above says "global ld" for an atomic - confirm.
+                        // functional write into the physical VGPR. No timing is modeled.
+                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
+                    }
+                    ++p1;
+                }
+
+                // Schedule the write operation of the returned data on the VRF.
+                // This simply models the timing aspect of the VRF write operation.
+                // It does not modify the physical VGPR.
+                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+                                         sizeof(CType), gpuDynInst->time);
+
+                if (this->isGlobalMem()) {
+                    gpuDynInst->computeUnit()->globalMemoryPipe
+                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+                } else {
+                    assert(this->isLocalMem());
+                    gpuDynInst->computeUnit()->localMemoryPipe
+                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+                }
+            }
+        }
+
         void execute(GPUDynInstPtr gpuDynInst) override;
 
       private:
author	Tony Gutierrez <anthony.gutierrez@amd.com>	2016-10-26 22:47:19 -0400
committer	Tony Gutierrez <anthony.gutierrez@amd.com>	2016-10-26 22:47:19 -0400
commit	00a6346c91f6e09eb9c0f4cf61a7d72932a1282f (patch)
tree	3ca7c5b2dedd8ed8151566b4d5abec95a06f4faf /src/arch/hsail/insts/mem.hh
parent	7ac38849abaf6aeccf39137bc8acb9e44d192e82 (diff)
download	gem5-00a6346c91f6e09eb9c0f4cf61a7d72932a1282f.tar.xz