summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--configs/example/apu_se.py8
-rw-r--r--src/arch/hsail/insts/decl.hh2
-rw-r--r--src/arch/hsail/insts/mem_impl.hh16
-rw-r--r--src/arch/hsail/insts/pseudo_inst.cc6
-rw-r--r--src/gpu-compute/GPU.py2
-rw-r--r--src/gpu-compute/compute_unit.cc12
-rw-r--r--src/gpu-compute/global_memory_pipeline.cc124
-rw-r--r--src/gpu-compute/global_memory_pipeline.hh49
8 files changed, 173 insertions, 46 deletions
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index b8ec149d5..5ec3289d2 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -153,7 +153,9 @@ parser.add_option('--fast-forward-pseudo-op', action='store_true',
help = 'fast forward using kvm until the m5_switchcpu'
' pseudo-op is encountered, then switch cpus. subsequent'
' m5_switchcpu pseudo-ops will toggle back and forth')
-
+parser.add_option('--outOfOrderDataDelivery', action='store_true',
+ default=False, help='enable OoO data delivery in the GM'
+ ' pipeline')
Ruby.define_options(parser)
@@ -248,7 +250,9 @@ for i in xrange(n_cu):
localDataStore = \
LdsState(banks = options.numLdsBanks,
bankConflictPenalty = \
- options.ldsBankConflictPenalty)))
+ options.ldsBankConflictPenalty),
+ out_of_order_data_delivery =
+ options.outOfOrderDataDelivery))
wavefronts = []
vrfs = []
for j in xrange(options.simds_per_cu):
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
index c40411ace..4c0bc9ce1 100644
--- a/src/arch/hsail/insts/decl.hh
+++ b/src/arch/hsail/insts/decl.hh
@@ -1082,7 +1082,7 @@ namespace HsailISA
gpuDynInst->useContinuation = false;
GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
- gmp->getGMReqFIFO().push(gpuDynInst);
+ gmp->issueRequest(gpuDynInst);
w->wrGmReqsInPipe--;
w->rdGmReqsInPipe--;
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
index c175f2782..dbda6643b 100644
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -263,7 +263,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
@@ -288,7 +288,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
@@ -312,7 +312,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
@@ -330,7 +330,7 @@ namespace HsailISA
}
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
break;
@@ -440,7 +440,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
@@ -460,7 +460,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
@@ -486,7 +486,7 @@ namespace HsailISA
}
}
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
break;
@@ -591,7 +591,7 @@ namespace HsailISA
m->latency.set(w->computeUnit->shader->ticks(64));
m->pipeId = GLBMEM_PIPE;
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
index bfffb7d8f..580328aed 100644
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -648,7 +648,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
@@ -688,7 +688,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsWrGm++;
w->wrGmReqsInPipe--;
w->outstandingReqsRdGm++;
@@ -727,7 +727,7 @@ namespace HsailISA
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
- w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->computeUnit->globalMemoryPipe.issueRequest(m);
w->outstandingReqsRdGm++;
w->rdGmReqsInPipe--;
w->outstandingReqs++;
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index b672f616c..0cb9e76a4 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -135,6 +135,8 @@ class ComputeUnit(MemObject):
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
+ out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
+ " in the GM pipeline")
class Shader(ClockedObject):
type = 'Shader'
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 93cffbe1e..ffa5243d2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1033,17 +1033,7 @@ ComputeUnit::DataPort::MemRespEvent::process()
if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
gpuDynInst->statusVector.clear();
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
-
- compute_unit->globalMemoryPipe.getGMLdRespFIFO()
- .push(gpuDynInst);
- } else {
- assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
-
- compute_unit->globalMemoryPipe.getGMStRespFIFO()
- .push(gpuDynInst);
- }
+ compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
compute_unit->cu_id, gpuDynInst->simdId,
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index f48af5a6f..7583ebb9b 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -45,7 +45,8 @@
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
- inflightStores(0), inflightLoads(0)
+ outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+ inflightLoads(0)
{
}
@@ -61,8 +62,7 @@ void
GlobalMemPipeline::exec()
{
// apply any returned global memory operations
- GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
- !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+ GPUDynInstPtr m = getNextReadyResp();
bool accessVrf = true;
Wavefront *w = nullptr;
@@ -74,30 +74,19 @@ GlobalMemPipeline::exec()
accessVrf =
w->computeUnit->vrf[w->simdId]->
- vrfOperandAccessReady(m->seqNum(), w, m,
- VrfAccessType::WRITE);
+ vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
}
- if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
- m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+ if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
accessVrf && m->statusBitVector == VectorMask(0) &&
(computeUnit->shader->coissue_return ||
- computeUnit->wfWait.at(m->pipeId).rdy())) {
+ computeUnit->wfWait.at(m->pipeId).rdy())) {
w = m->wavefront();
m->completeAcc(m);
- if (m->isLoad() || m->isAtomic()) {
- gmReturnedLoads.pop();
- assert(inflightLoads > 0);
- --inflightLoads;
- } else {
- assert(m->isStore());
- gmReturnedStores.pop();
- assert(inflightStores > 0);
- --inflightStores;
- }
+ completeRequest(m);
// Decrement outstanding register count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -129,15 +118,30 @@ GlobalMemPipeline::exec()
} else {
++inflightLoads;
}
- } else {
+ } else if (mp->isStore()) {
if (inflightStores >= gmQueueSize) {
return;
- } else if (mp->isStore()) {
+ } else {
++inflightStores;
}
}
mp->initiateAcc(mp);
+
+ if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+ /**
+ * if we are not in out-of-order data delivery mode
+ * then we keep the responses sorted in program order.
+ * in order to do so we must reserve an entry in the
+ * resp buffer before we issue the request to the mem
+ * system. mem fence requests will not be stored here
+ * because once they are issued from the GM pipeline,
+ * they do not send any response back to it.
+ */
+ gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
+ std::make_pair(mp, false)));
+ }
+
gmIssuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
@@ -145,6 +149,86 @@ GlobalMemPipeline::exec()
}
}
+GPUDynInstPtr
+GlobalMemPipeline::getNextReadyResp()
+{
+ if (outOfOrderDataDelivery) {
+ if (!gmReturnedLoads.empty()) {
+ return gmReturnedLoads.front();
+ } else if (!gmReturnedStores.empty()) {
+ return gmReturnedStores.front();
+ }
+ } else {
+ if (!gmOrderedRespBuffer.empty()) {
+ auto mem_req = gmOrderedRespBuffer.begin();
+
+ if (mem_req->second.second) {
+ return mem_req->second.first;
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+void
+GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
+{
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(inflightLoads > 0);
+ --inflightLoads;
+ } else if (gpuDynInst->isStore()) {
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ if (outOfOrderDataDelivery) {
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(!gmReturnedLoads.empty());
+ gmReturnedLoads.pop();
+ } else if (gpuDynInst->isStore()) {
+ assert(!gmReturnedStores.empty());
+ gmReturnedStores.pop();
+ }
+ } else {
+        // we should only pop the oldest request, and it
+        // should be marked as done if we are here
+ assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+ assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+ assert(gmOrderedRespBuffer.begin()->second.second);
+ // remove this instruction from the buffer by its
+ // unique seq ID
+ gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
+ }
+}
+
+void
+GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
+{
+ gmIssuedRequests.push(gpuDynInst);
+}
+
+void
+GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
+{
+ if (outOfOrderDataDelivery) {
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ assert(isGMLdRespFIFOWrRdy());
+ gmReturnedLoads.push(gpuDynInst);
+ } else {
+ assert(isGMStRespFIFOWrRdy());
+ gmReturnedStores.push(gpuDynInst);
+ }
+ } else {
+ auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+ // if we are getting a response for this mem request,
+ // then it ought to already be in the ordered response
+ // buffer
+ assert(mem_req != gmOrderedRespBuffer.end());
+ mem_req->second.second = true;
+ }
+}
+
void
GlobalMemPipeline::regStats()
{
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index 368a15079..d10b7c1a2 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -62,10 +62,40 @@ class GlobalMemPipeline
void init(ComputeUnit *cu);
void exec();
- std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
+ /**
+ * find the next ready response to service. for OoO mode we
+ * simply pop the oldest (based on when the response was
+ * received) response in the response FIFOs. for in-order mode
+ * we pop the oldest (in program order) response, and only if
+ * it is marked as done.
+ */
+ GPUDynInstPtr getNextReadyResp();
+
+ /**
+ * once a memory request is finished we remove it from the
+ * buffer. this method determines which response buffer
+ * we're using based on the mode (in-order vs. OoO).
+ */
+ void completeRequest(GPUDynInstPtr gpuDynInst);
+
+ /**
+ * issues a request to the pipeline - i.e., enqueue it
+ * in the request buffer.
+ */
+ void issueRequest(GPUDynInstPtr gpuDynInst);
+
+ /**
+ * this method handles responses sent to this GM pipeline by the
+     * CU. in the case of in-order delivery it simply marks the request
+     * as done in the ordered buffer to indicate that the request is
+     * finished. for out-of-order data delivery, the requests are enqueued
+ * (in the order in which they are received) in the response FIFOs.
+ */
+ void handleResponse(GPUDynInstPtr gpuDynInst);
+
bool
isGMLdRespFIFOWrRdy() const
{
@@ -97,6 +127,7 @@ class GlobalMemPipeline
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
+ bool outOfOrderDataDelivery;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
@@ -111,6 +142,22 @@ class GlobalMemPipeline
// The size of global memory.
int globalMemSize;
+ /*
+ * this buffer holds the memory responses when in-order data
+     * delivery is used - the responses are ordered by their unique
+ * sequence number, which is monotonically increasing. when a
+ * memory request returns its "done" flag is set to true. during
+     * each tick the GM pipeline will check if the oldest request
+ * is finished, and if so it will be removed from the queue.
+ *
+ * key: memory instruction's sequence ID
+ *
+ * value: pair holding the instruction pointer and a bool that
+ * is used to indicate whether or not the request has
+ * completed
+ */
+ std::map<uint64_t, std::pair<GPUDynInstPtr, bool>> gmOrderedRespBuffer;
+
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;