From 95735e10e7ea85320ee39c15a4132eece8417af4 Mon Sep 17 00:00:00 2001 From: "Mitch Hayenga ext:(%2C%20Amin%20Farmahini%20%3Caminfar%40gmail.com%3E)" Date: Wed, 29 Jan 2014 23:21:25 -0600 Subject: mem: prefetcher: add options, support for unaligned addresses This patch extends the classic prefetcher to work on non-block aligned addresses. Because the existing prefetchers in gem5 mask off the lower address bits of cache accesses, many predictable strides fail to be detected. For example, if a load were to stride by 48 bytes, with 64 byte cachelines, the current stride based prefetcher would see an access pattern of 0, 64, 64, 128, 192.... Thus not detecting a constant stride pattern. This patch fixes this, by training the prefetcher on accesses and not masking off the lower address bits. It also adds the following configuration options: 1) Training/prefetching only on cache misses, 2) Training/prefetching only on data accesses, 3) Optionally tagging prefetches with a PC address. #3 allows prefetchers to train off of prefetch requests in systems with multiple cache levels and PC-based prefetchers present at multiple levels. It also effectively allows a pipelining of prefetch requests (like in POWER4) across multiple levels of cache hierarchy. Improves performance on my gem5 configuration by 4.3% for SPECINT and 4.7% for SPECFP (geomean). 
--- src/mem/cache/cache_impl.hh | 11 +++++++++++ src/mem/cache/prefetch/Prefetcher.py | 6 ++++++ src/mem/cache/prefetch/base.cc | 18 ++++++++++++++++-- src/mem/cache/prefetch/base.hh | 20 +++++++++++++++----- src/mem/cache/prefetch/stride.cc | 18 +++++++++--------- src/mem/request.hh | 7 +++++++ 6 files changed, 64 insertions(+), 16 deletions(-) diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh index b26473336..76fb697c2 100644 --- a/src/mem/cache/cache_impl.hh +++ b/src/mem/cache/cache_impl.hh @@ -556,6 +556,17 @@ Cache::recvTimingReq(PacketPtr pkt) // move it ahead of mshrs that are ready // mshrQueue.moveToFront(mshr); } + + // We should call the prefetcher regardless of whether the request is + // satisfied or not, and regardless of whether the request is in the + // MSHR or not. The request could be a ReadReq hit, but still not + // satisfied (potentially because of a prior write to the same + // cache line). So, even when not satisfied, if there is an MSHR + // already allocated for this, we need to let the prefetcher know + // about the request + if (prefetcher) { + next_pf_time = prefetcher->notify(pkt, time); + } } else { // no MSHR assert(pkt->req->masterId() < system->maxMasters()); diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py index af67f40b6..7d7aeed32 100644 --- a/src/mem/cache/prefetch/Prefetcher.py +++ b/src/mem/cache/prefetch/Prefetcher.py @@ -59,6 +59,12 @@ class BasePrefetcher(ClockedObject): "Use the master id to separate calculations of prefetches") data_accesses_only = Param.Bool(False, "Only prefetch on data not on instruction accesses") + on_miss_only = Param.Bool(False, + "Only prefetch on miss (as opposed to always)") + on_read_only = Param.Bool(False, + "Only prefetch on read requests (write requests ignored)") + on_prefetch = Param.Bool(True, + "Let lower cache prefetcher train on prefetch requests") sys = Param.System(Parent.any, "System this device belongs to") class 
GHBPrefetcher(BasePrefetcher): diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc index c440978e6..d5cddc88e 100644 --- a/src/mem/cache/prefetch/base.cc +++ b/src/mem/cache/prefetch/base.cc @@ -60,7 +60,9 @@ BasePrefetcher::BasePrefetcher(const Params *p) : ClockedObject(p), size(p->size), latency(p->latency), degree(p->degree), useMasterId(p->use_master_id), pageStop(!p->cross_pages), serialSquash(p->serial_squash), onlyData(p->data_accesses_only), - system(p->sys), masterId(system->getMasterId(name())) + onMissOnly(p->on_miss_only), onReadOnly(p->on_read_only), + onPrefetch(p->on_prefetch), system(p->sys), + masterId(system->getMasterId(name())) { } @@ -185,7 +187,14 @@ BasePrefetcher::getPacket() Tick BasePrefetcher::notify(PacketPtr &pkt, Tick tick) { - if (!pkt->req->isUncacheable() && !(pkt->req->isInstFetch() && onlyData)) { + // Don't consult the prefetcher if any of the following conditions are true + // 1) The request is uncacheable + // 2) The request is a fetch, but we are only prefetching data + // 3) The request is a cache hit, but we are only training on misses + // 4) The request is a write, but we are only training on reads + if (!pkt->req->isUncacheable() && !(pkt->req->isInstFetch() && onlyData) && + !(onMissOnly && inCache(pkt->getAddr(), true)) && + !(onReadOnly && !pkt->isRead())) { // Calculate the blk address Addr blk_addr = pkt->getAddr() & ~(Addr)(blkSize-1); bool is_secure = pkt->isSecure(); @@ -262,6 +271,11 @@ BasePrefetcher::notify(PacketPtr &pkt, Tick tick) prefetch->req->setThreadContext(pkt->req->contextId(), pkt->req->threadId()); + // Tag prefetch requests with corresponding PC to train lower + // cache-level prefetchers + if (onPrefetch && pkt->req->hasPC()) + prefetch->req->setPC(pkt->req->getPC()); + // We just remove the head if we are full if (pf.size() == size) { pfRemovedFull++; diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh index 953852c38..fc0dd0b36 100644 --- 
a/src/mem/cache/prefetch/base.hh +++ b/src/mem/cache/prefetch/base.hh @@ -89,18 +89,28 @@ class BasePrefetcher : public ClockedObject const Cycles latency; /** The number of prefetches to issue */ - unsigned degree; + const unsigned degree; /** If patterns should be found per context id */ - bool useMasterId; + const bool useMasterId; /** Do we prefetch across page boundaries. */ - bool pageStop; + const bool pageStop; /** Do we remove prefetches with later times than a new miss.*/ - bool serialSquash; + const bool serialSquash; /** Do we prefetch on only data reads, or on inst reads as well. */ - bool onlyData; + const bool onlyData; + + /** Do we trigger/train prefetch on cache misses only, or all accesses. */ + const bool onMissOnly; + + /** Do we trigger/train prefetch on reads only, or all accesses. */ + const bool onReadOnly; + + /** Do we tag prefetches with PC addresses, allowing lower PC-based + prefetchers to prefetch on prefetch requests */ + const bool onPrefetch; /** System we belong to */ System* system; diff --git a/src/mem/cache/prefetch/stride.cc b/src/mem/cache/prefetch/stride.cc index fd8b20fcc..a7abf4809 100644 --- a/src/mem/cache/prefetch/stride.cc +++ b/src/mem/cache/prefetch/stride.cc @@ -59,7 +59,7 @@ StridePrefetcher::calculatePrefetch(PacketPtr &pkt, std::list &addresses, return; } - Addr blk_addr = pkt->getAddr() & ~(Addr)(blkSize-1); + Addr data_addr = pkt->getAddr(); bool is_secure = pkt->isSecure(); MasterID master_id = useMasterId ? 
pkt->req->masterId() : 0; Addr pc = pkt->req->getPC(); @@ -77,7 +77,7 @@ StridePrefetcher::calculatePrefetch(PacketPtr &pkt, std::list &addresses, if (iter != tab.end()) { // Hit in table - int new_stride = blk_addr - (*iter)->missAddr; + int new_stride = data_addr - (*iter)->missAddr; bool stride_match = (new_stride == (*iter)->stride); if (stride_match && new_stride != 0) { @@ -89,20 +89,20 @@ StridePrefetcher::calculatePrefetch(PacketPtr &pkt, std::list &addresses, (*iter)->confidence = 0; } - DPRINTF(HWPrefetch, "hit: PC %x blk_addr %x (%s) stride %d (%s), " - "conf %d\n", pc, blk_addr, is_secure ? "s" : "ns", new_stride, + DPRINTF(HWPrefetch, "hit: PC %x data_addr %x (%s) stride %d (%s), " + "conf %d\n", pc, data_addr, is_secure ? "s" : "ns", new_stride, stride_match ? "match" : "change", (*iter)->confidence); - (*iter)->missAddr = blk_addr; + (*iter)->missAddr = data_addr; (*iter)->isSecure = is_secure; if ((*iter)->confidence <= 0) return; for (int d = 1; d <= degree; d++) { - Addr new_addr = blk_addr + d * new_stride; - if (pageStop && !samePage(blk_addr, new_addr)) { + Addr new_addr = data_addr + d * new_stride; + if (pageStop && !samePage(data_addr, new_addr)) { // Spanned the page, so now stop pfSpanPage += degree - d + 1; return; @@ -117,7 +117,7 @@ StridePrefetcher::calculatePrefetch(PacketPtr &pkt, std::list &addresses, // Miss in table // Find lowest confidence and replace - DPRINTF(HWPrefetch, "miss: PC %x blk_addr %x (%s)\n", pc, blk_addr, + DPRINTF(HWPrefetch, "miss: PC %x data_addr %x (%s)\n", pc, data_addr, is_secure ? 
"s" : "ns"); if (tab.size() >= 256) { //set default table size is 256 @@ -139,7 +139,7 @@ StridePrefetcher::calculatePrefetch(PacketPtr &pkt, std::list &addresses, StrideEntry *new_entry = new StrideEntry; new_entry->instAddr = pc; - new_entry->missAddr = blk_addr; + new_entry->missAddr = data_addr; new_entry->isSecure = is_secure; new_entry->stride = 0; new_entry->confidence = 0; diff --git a/src/mem/request.hh b/src/mem/request.hh index 28d378628..e84a77272 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -576,6 +576,13 @@ class Request return _threadId; } + void + setPC(Addr pc) + { + privateFlags.set(VALID_PC); + _pc = pc; + } + bool hasPC() const { -- cgit v1.2.3