summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMitch Hayenga <mitch.hayenga@arm.com>2014-09-03 07:42:35 -0400
committerMitch Hayenga <mitch.hayenga@arm.com>2014-09-03 07:42:35 -0400
commitecd53009712da59a98ad3c13ed20aaa8e8cd7e29 (patch)
treeb23bf72ad68c2dce684d35ff1e6dda6c4a573150
parent1716749c8cec6f9c9f10a0aeaff981be759bb4e5 (diff)
downloadgem5-ecd53009712da59a98ad3c13ed20aaa8e8cd7e29.tar.xz
cpu: Add a fetch queue to the o3 cpu
This patch adds a fetch queue that sits between fetch and decode to the o3 cpu. This effectively decouples fetch from decode stalls allowing it to be more aggressive, running further ahead in the instruction stream.
-rw-r--r--src/cpu/o3/O3CPU.py1
-rw-r--r--src/cpu/o3/fetch.hh14
-rw-r--r--src/cpu/o3/fetch_impl.hh61
3 files changed, 55 insertions, 21 deletions
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
index fb5b5de2b..c70a12f1d 100644
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -61,6 +61,7 @@ class DerivO3CPU(BaseCPU):
commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
fetchWidth = Param.Unsigned(8, "Fetch width")
fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
+ fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops")
renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode "
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 0c1b81d86..2e9428ef1 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010-2012 ARM Limited
+ * Copyright (c) 2010-2012, 2014 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -401,9 +401,6 @@ class DefaultFetch
/** Wire to get commit's information from backwards time buffer. */
typename TimeBuffer<TimeStruct>::wire fromCommit;
- /** Internal fetch instruction queue. */
- TimeBuffer<FetchStruct> *fetchQueue;
-
//Might be annoying how this name is different than the queue.
/** Wire used to write any information heading to decode. */
typename TimeBuffer<FetchStruct>::wire toDecode;
@@ -455,6 +452,9 @@ class DefaultFetch
/** The width of fetch in instructions. */
unsigned fetchWidth;
+ /** The width of decode in instructions. */
+ unsigned decodeWidth;
+
/** Is the cache blocked? If so no threads can access it. */
bool cacheBlocked;
@@ -481,6 +481,12 @@ class DefaultFetch
/** The PC of the first instruction loaded into the fetch buffer. */
Addr fetchBufferPC[Impl::MaxThreads];
+ /** The size of the fetch queue in micro-ops */
+ unsigned fetchQueueSize;
+
+ /** Queue of fetched instructions */
+ std::deque<DynInstPtr> fetchQueue;
+
/** Whether or not the fetch buffer data is valid. */
bool fetchBufferValid[Impl::MaxThreads];
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index 637e39957..219444ace 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -82,11 +82,13 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
iewToFetchDelay(params->iewToFetchDelay),
commitToFetchDelay(params->commitToFetchDelay),
fetchWidth(params->fetchWidth),
+ decodeWidth(params->decodeWidth),
retryPkt(NULL),
retryTid(InvalidThreadID),
cacheBlkSize(cpu->cacheLineSize()),
fetchBufferSize(params->fetchBufferSize),
fetchBufferMask(fetchBufferSize - 1),
+ fetchQueueSize(params->fetchQueueSize),
numThreads(params->numThreads),
numFetchingThreads(params->smtNumFetchingThreads),
finishTranslationEvent(this)
@@ -313,12 +315,10 @@ DefaultFetch<Impl>::setActiveThreads(std::list<ThreadID> *at_ptr)
template<class Impl>
void
-DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
+DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *ftb_ptr)
{
- fetchQueue = fq_ptr;
-
- // Create wire to write information to proper place in fetch queue.
- toDecode = fetchQueue->getWire(0);
+ // Create wire to write information to proper place in fetch time buf.
+ toDecode = ftb_ptr->getWire(0);
}
template<class Impl>
@@ -342,6 +342,7 @@ DefaultFetch<Impl>::resetStage()
cacheBlocked = false;
priorityList.clear();
+ fetchQueue.clear();
// Setup PC and nextPC with initial state.
for (ThreadID tid = 0; tid < numThreads; ++tid) {
@@ -454,6 +455,10 @@ DefaultFetch<Impl>::isDrained() const
return false;
}
+ // Not drained if fetch queue contains entries
+ if (!fetchQueue.empty())
+ return false;
+
/* The pipeline might start up again in the middle of the drain
* cycle if the finish translation event is scheduled, so make
* sure that's not the case.
@@ -673,11 +678,8 @@ DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req)
fetchStatus[tid] = IcacheWaitResponse;
}
} else {
- // Don't send an instruction to decode if it can't handle it.
- // Asynchronous nature of this function's calling means we have to
- // check 2 signals to see if decode is stalled.
- if (!(numInst < fetchWidth) || stalls[tid].decode ||
- fromDecode->decodeBlock[tid]) {
+ // Don't send an instruction to decode if we can't handle it.
+ if (!(numInst < fetchWidth) || !(fetchQueue.size() < fetchQueueSize)) {
assert(!finishTranslationEvent.scheduled());
finishTranslationEvent.setFault(fault);
finishTranslationEvent.setReq(mem_req);
@@ -758,6 +760,15 @@ DefaultFetch<Impl>::doSquash(const TheISA::PCState &newPC,
fetchStatus[tid] = Squashing;
+ // Empty fetch queue
+ auto inst_itr = fetchQueue.begin();
+ while (inst_itr != fetchQueue.end()) {
+ if ((*inst_itr)->threadNumber == tid)
+ inst_itr = fetchQueue.erase(inst_itr);
+ else
+ ++inst_itr;
+ }
+
// microops are being squashed, it is not known whether the
// youngest non-squashed microop was marked delayed commit
// or not. Setting the flag to true ensures that the
@@ -796,9 +807,6 @@ DefaultFetch<Impl>::checkStall(ThreadID tid) const
assert(cpu->isDraining());
DPRINTF(Fetch,"[tid:%i]: Drain stall detected.\n",tid);
ret_val = true;
- } else if (stalls[tid].decode) {
- DPRINTF(Fetch,"[tid:%i]: Stall from Decode stage detected.\n",tid);
- ret_val = true;
}
return ret_val;
@@ -921,6 +929,21 @@ DefaultFetch<Impl>::tick()
}
}
+ // Send instructions enqueued into the fetch queue to decode.
+ // Limit rate by fetchWidth. Stall if decode is stalled.
+ unsigned instsToDecode = 0;
+ while(!fetchQueue.empty() &&
+ instsToDecode < decodeWidth &&
+ !stalls[fetchQueue.front()->threadNumber].decode) {
+ auto inst = fetchQueue.front();
+ toDecode->insts[toDecode->size++] = inst;
+ DPRINTF(Fetch, "[tid:%i][sn:%i]: Sending instruction to decode from "
+ "fetch queue. Fetch queue size: %i.\n",
+ inst->threadNumber, inst->seqNum, fetchQueue.size());
+ fetchQueue.pop_front();
+ instsToDecode++;
+ }
+
// Reset the number of the instruction we've fetched.
numInst = 0;
}
@@ -1072,7 +1095,11 @@ DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
// Write the instruction to the first slot in the queue
// that heads to decode.
assert(numInst < fetchWidth);
- toDecode->insts[toDecode->size++] = instruction;
+ fetchQueue.push_back(instruction);
+ assert(fetchQueue.size() <= fetchQueueSize);
+ DPRINTF(Fetch, "[tid:%i]: Fetch queue entry created (%i/%i).\n",
+ tid, fetchQueue.size(), fetchQueueSize);
+ //toDecode->insts[toDecode->size++] = instruction;
// Keep track of if we can take an interrupt at this boundary
delayedCommit[tid] = instruction->isDelayedCommit();
@@ -1186,8 +1213,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
// Loop through instruction memory from the cache.
// Keep issuing while fetchWidth is available and branch is not
// predicted taken
- while (numInst < fetchWidth && !predictedBranch) {
-
+ while (numInst < fetchWidth && fetchQueue.size() < fetchQueueSize
+ && !predictedBranch) {
// We need to process more memory if we aren't going to get a
// StaticInst from the rom, the current macroop, or what's already
// in the decoder.
@@ -1310,7 +1337,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
break;
}
} while ((curMacroop || decoder[tid]->instReady()) &&
- numInst < fetchWidth);
+ numInst < fetchWidth && fetchQueue.size() < fetchQueueSize);
}
if (predictedBranch) {