-rw-r--r--  src/mem/cache/Cache.py    24
-rw-r--r--  src/mem/cache/base.cc     83
-rw-r--r--  src/mem/cache/base.hh    149
-rw-r--r--  src/mem/cache/cache.cc     5
4 files changed, 255 insertions, 6 deletions
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 230131bdc..8ffab911b 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -41,6 +41,7 @@
from m5.params import *
from m5.proxy import *
+from m5.SimObject import SimObject
from MemObject import MemObject
from Prefetcher import BasePrefetcher
from ReplacementPolicies import *
@@ -51,6 +52,24 @@ from Tags import *
# exclusive.
class Clusivity(Enum): vals = ['mostly_incl', 'mostly_excl']
+class WriteAllocator(SimObject):
+ type = 'WriteAllocator'
+ cxx_header = "mem/cache/cache.hh"
+
+ # Control the limits for when the cache introduces extra delays to
+ # allow whole-line write coalescing, and eventually switches to a
+ # write-no-allocate policy.
+ coalesce_limit = Param.Unsigned(2, "Consecutive lines written before "
+ "delaying for coalescing")
+ no_allocate_limit = Param.Unsigned(12, "Consecutive lines written before"
+ " skipping allocation")
+
+ delay_threshold = Param.Unsigned(8, "Number of delay quanta imposed on an "
+ "MSHR with write requests to allow for "
+ "write coalescing")
+
+ block_size = Param.Int(Parent.cache_line_size, "block size in bytes")
+
class BaseCache(MemObject):
type = 'BaseCache'
@@ -116,6 +135,11 @@ class BaseCache(MemObject):
clusivity = Param.Clusivity('mostly_incl',
"Clusivity with upstream cache")
+ # The write allocator enables optimizations for streaming write
+ # accesses by first coalescing writes and then avoiding allocation
+ # in the current cache. Typically, this would be enabled in the
+ # data cache.
+ write_allocator = Param.WriteAllocator(NULL, "Write allocator")
class Cache(BaseCache):
type = 'Cache'
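
For context, a minimal configuration sketch of how the new parameters might be used. This script is not part of the patch; the L1DCache class and its size/latency values are illustrative stand-ins for a typical se.py/fs.py-style setup.

# Hypothetical config snippet: enable the write allocator on a data
# cache. Only write_allocator is new in this patch; everything else
# is a standard classic-cache configuration.
from m5.objects import Cache, WriteAllocator

class L1DCache(Cache):
    size = '32kB'
    assoc = 2
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 8
    # Defaults from Cache.py: start coalescing after 2 consecutive
    # lines of writes, stop allocating after 12.
    write_allocator = WriteAllocator()
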
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index b292e5a25..183d93f14 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -58,6 +58,7 @@
#include "mem/cache/prefetch/base.hh"
#include "mem/cache/queue_entry.hh"
#include "params/BaseCache.hh"
+#include "params/WriteAllocator.hh"
#include "sim/core.hh"
class BaseMasterPort;
@@ -83,6 +84,7 @@ BaseCache::BaseCache(const BaseCacheParams *p, unsigned blk_size)
tags(p->tags),
prefetcher(p->prefetcher),
prefetchOnAccess(p->prefetch_on_access),
+ writeAllocator(p->write_allocator),
writebackClean(p->writeback_clean),
tempBlockWriteback(nullptr),
writebackTempBlockAtomicEvent([this]{ writebackTempBlockAtomic(); },
@@ -243,6 +245,12 @@ void
BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
Tick forward_time, Tick request_time)
{
+ if (writeAllocator &&
+ pkt && pkt->isWrite() && !pkt->req->isUncacheable()) {
+ writeAllocator->updateMode(pkt->getAddr(), pkt->getSize(),
+ pkt->getBlockAddr(blkSize));
+ }
+
if (mshr) {
/// MSHR hit
/// @note writebacks will be checked in getNextMSHR()
@@ -391,11 +399,13 @@ BaseCache::recvTimingReq(PacketPtr pkt)
// already allocated for this, we need to let the prefetcher
// know about the request
- // Don't notify prefetcher on SWPrefetch or cache maintenance
- // operations
+ // Don't notify prefetcher on SWPrefetch, cache maintenance
+ // operations or for writes that we are coalescing.
if (prefetcher && pkt &&
!pkt->cmd.isSWPrefetch() &&
- !pkt->req->isCacheMaintenance()) {
+ !pkt->req->isCacheMaintenance() &&
+ !(writeAllocator && writeAllocator->coalesce() &&
+ pkt->isWrite())) {
next_pf_time = prefetcher->notify(pkt);
}
}
@@ -487,7 +497,9 @@ BaseCache::recvTimingResp(PacketPtr pkt)
DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
pkt->getAddr());
- blk = handleFill(pkt, blk, writebacks, mshr->allocOnFill());
+ const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
+ writeAllocator->allocate() : mshr->allocOnFill();
+ blk = handleFill(pkt, blk, writebacks, allocate);
assert(blk != nullptr);
}
@@ -1461,6 +1473,29 @@ BaseCache::sendMSHRQueuePacket(MSHR* mshr)
DPRINTF(Cache, "%s: MSHR %s\n", __func__, tgt_pkt->print());
+ // if the cache is in write coalescing mode or (additionally) in
+ // no allocation mode, and we have a write packet with an MSHR
+ // that is not a whole-line write (due to incompatible flags etc),
+ // then delay it while coalescing is still possible, and otherwise
+ // reset the write mode
+ if (writeAllocator && writeAllocator->coalesce() && tgt_pkt->isWrite()) {
+ if (!mshr->isWholeLineWrite()) {
+ // if we are currently write coalescing, hold on the
+ // MSHR as many cycles extra as we need to completely
+ // write a cache line
+ if (writeAllocator->delay(mshr->blkAddr)) {
+ Tick delay = blkSize / tgt_pkt->getSize() * clockPeriod();
+ DPRINTF(CacheVerbose, "Delaying pkt %s %llu ticks to allow "
+ "for write coalescing\n", tgt_pkt->print(), delay);
+ mshrQueue.delay(mshr, delay);
+ return false;
+ } else {
+ writeAllocator->reset();
+ }
+ } else {
+ writeAllocator->resetDelay(mshr->blkAddr);
+ }
+ }
+
CacheBlk *blk = tags->findBlock(mshr->blkAddr, mshr->isSecure);
// either a prefetch that is not present upstream, or a normal
@@ -2357,3 +2392,43 @@ BaseCache::MemSidePort::MemSidePort(const std::string &_name,
_snoopRespQueue(*_cache, *this, _label), cache(_cache)
{
}
+
+void
+WriteAllocator::updateMode(Addr write_addr, unsigned write_size,
+ Addr blk_addr)
+{
+ // check if we are continuing where the last write ended
+ if (nextAddr == write_addr) {
+ delayCtr[blk_addr] = delayThreshold;
+ // stop if we have already saturated
+ if (mode != WriteMode::NO_ALLOCATE) {
+ byteCount += write_size;
+ // switch to streaming mode if we have passed the lower
+ // threshold
+ if (mode == WriteMode::ALLOCATE &&
+ byteCount > coalesceLimit) {
+ mode = WriteMode::COALESCE;
+ DPRINTF(Cache, "Switched to write coalescing\n");
+ } else if (mode == WriteMode::COALESCE &&
+ byteCount > noAllocateLimit) {
+ // and continue and switch to non-allocating mode if we
+ // pass the upper threshold
+ mode = WriteMode::NO_ALLOCATE;
+ DPRINTF(Cache, "Switched to write-no-allocate\n");
+ }
+ }
+ } else {
+ // we did not see a write matching the previous one, start
+ // over again
+ byteCount = write_size;
+ mode = WriteMode::ALLOCATE;
+ resetDelay(blk_addr);
+ }
+ nextAddr = write_addr + write_size;
+}
+
+WriteAllocator*
+WriteAllocatorParams::create()
+{
+ return new WriteAllocator(this);
+}
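
To make the thresholds in updateMode() concrete: with the Cache.py defaults (block_size = 64, coalesce_limit = 2, no_allocate_limit = 12), coalesceLimit is 128 bytes and noAllocateLimit is 768 bytes. Below is a minimal standalone sketch of the mode transitions, stripped of the per-block delay counters and SimObject machinery; the WriteStreamModel name is made up for this illustration.

#include <cstdint>
#include <cstdio>

enum class WriteMode { ALLOCATE, COALESCE, NO_ALLOCATE };

struct WriteStreamModel {
    uint64_t nextAddr = 0;
    uint32_t byteCount = 0;
    WriteMode mode = WriteMode::ALLOCATE;
    const uint32_t coalesceLimit = 2 * 64;    // Cache.py defaults
    const uint32_t noAllocateLimit = 12 * 64;

    void write(uint64_t addr, unsigned size) {
        if (addr == nextAddr) {
            // same saturation behaviour as updateMode() above
            if (mode != WriteMode::NO_ALLOCATE) {
                byteCount += size;
                if (mode == WriteMode::ALLOCATE &&
                    byteCount > coalesceLimit) {
                    mode = WriteMode::COALESCE;
                } else if (mode == WriteMode::COALESCE &&
                           byteCount > noAllocateLimit) {
                    mode = WriteMode::NO_ALLOCATE;
                }
            }
        } else {
            // stream broken: restart tracking from this write
            byteCount = size;
            mode = WriteMode::ALLOCATE;
        }
        nextAddr = addr + size;
    }
};

int main()
{
    WriteStreamModel m;
    // A memset-like stream of 16-byte stores: byteCount exceeds 128
    // on the 9th store (144 bytes) and 768 on the 49th (784 bytes).
    for (unsigned i = 0; i < 64; i++) {
        m.write(i * 16, 16);
        printf("store %2u: mode %d\n", i + 1, static_cast<int>(m.mode));
    }
    return 0;
}
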
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 47218f828..b9fd7f943 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -73,6 +73,7 @@
#include "mem/packet_queue.hh"
#include "mem/qport.hh"
#include "mem/request.hh"
+#include "params/WriteAllocator.hh"
#include "sim/eventq.hh"
#include "sim/serialize.hh"
#include "sim/sim_exit.hh"
@@ -329,6 +330,22 @@ class BaseCache : public MemObject
const bool prefetchOnAccess;
/**
+ * The writeAllocator drives optimizations for streaming writes.
+ * It first determines whether a WriteReq MSHR should be delayed,
+ * thus ensuring that we wait longer in cases when we are write
+ * coalescing and allowing all the bytes of the line to be written
+ * before the MSHR packet is sent downstream. This works in unison
+ * with the tracking in the MSHR to check if the entire line is
+ * written. The write mode also affects the behaviour on filling
+ * any whole-line writes. Normally the cache allocates the line
+ * when receiving the InvalidateResp, but after seeing enough
+ * consecutive lines we switch to using the tempBlock, and thus
+ * end up not allocating the line, and instead turning the
+ * whole-line write into a writeback straight away.
+ */
+ WriteAllocator * const writeAllocator;
+
+ /**
* Temporary cache block for occasional transitory use. We use
* the tempBlock to fill when allocation fails (e.g., when there
* is an outstanding request that accesses the victim block) or
@@ -1161,4 +1178,136 @@ class BaseCache : public MemObject
};
+/**
+ * The write allocator inspects write packets and detects streaming
+ * patterns. The write allocator supports a single stream where writes
+ * are expected to access consecutive locations, and keeps track of the
+ * size of the area covered by these consecutive writes in byteCount.
+ *
+ * 1) When byteCount has surpassed the coalesceLimit, the mode
+ * switches from ALLOCATE to COALESCE, where writes should be delayed
+ * until the whole block is written, at which point a single packet
+ * (a whole-line write) can service them.
+ *
+ * 2) When byteCount has also exceeded the noAllocateLimit, we switch
+ * to NO_ALLOCATE, where writes should not allocate in the cache but
+ * rather be sent as a whole-line write to the memory below.
+ */
+class WriteAllocator : public SimObject {
+ public:
+ WriteAllocator(const WriteAllocatorParams *p) :
+ SimObject(p),
+ coalesceLimit(p->coalesce_limit * p->block_size),
+ noAllocateLimit(p->no_allocate_limit * p->block_size),
+ delayThreshold(p->delay_threshold)
+ {
+ reset();
+ }
+
+ /**
+ * Should writes be coalesced? This is true if the mode is either
+ * COALESCE or NO_ALLOCATE.
+ *
+ * @return true if the cache should coalesce writes.
+ */
+ bool coalesce() const {
+ return mode != WriteMode::ALLOCATE;
+ }
+
+ /**
+ * Should writes allocate?
+ *
+ * @return true if the cache should allocate for writes.
+ */
+ bool allocate() const {
+ return mode != WriteMode::NO_ALLOCATE;
+ }
+
+ /**
+ * Reset the write allocator state, meaning that it allocates for
+ * writes and has not recorded any information about qualifying
+ * writes that might trigger a switch to coalescing and later no
+ * allocation.
+ */
+ void reset() {
+ mode = WriteMode::ALLOCATE;
+ byteCount = 0;
+ nextAddr = 0;
+ }
+
+ /**
+ * Check whether the current write needs to be delayed.
+ *
+ * @param blk_addr The block address the packet writes to
+ * @return true if the current packet should be delayed
+ */
+ bool delay(Addr blk_addr) {
+ if (delayCtr[blk_addr] > 0) {
+ --delayCtr[blk_addr];
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Clear delay counter for the input block
+ *
+ * @param blk_addr The accessed cache block
+ */
+ void resetDelay(Addr blk_addr) {
+ delayCtr.erase(blk_addr);
+ }
+
+ /**
+ * Update the write mode based on the current write
+ * packet. This method compares the packet's address with any
+ * current stream, and updates the tracking and the mode
+ * accordingly.
+ *
+ * @param write_addr Start address of the write request
+ * @param write_size Size of the write request
+ * @param blk_addr The block address that this packet writes to
+ */
+ void updateMode(Addr write_addr, unsigned write_size, Addr blk_addr);
+
+ private:
+ /**
+ * The current mode for write coalescing and allocation, either
+ * normal operation (ALLOCATE), write coalescing (COALESCE), or
+ * write coalescing without allocation (NO_ALLOCATE).
+ */
+ enum class WriteMode : char {
+ ALLOCATE,
+ COALESCE,
+ NO_ALLOCATE,
+ };
+ WriteMode mode;
+
+ /** Address to match writes against to detect streams. */
+ Addr nextAddr;
+
+ /**
+ * Bytes written contiguously. Saturating once we no longer
+ * allocate.
+ */
+ uint32_t byteCount;
+
+ /**
+ * Limits for when to switch between the different write modes.
+ */
+ const uint32_t coalesceLimit;
+ const uint32_t noAllocateLimit;
+ /**
+ * The number of times the allocator will delay a WriteReq MSHR.
+ */
+ const uint32_t delayThreshold;
+
+ /**
+ * Per-block counters of remaining delay quanta, tracking how many
+ * more times the allocator may delay a WriteReq MSHR.
+ */
+ std::unordered_map<Addr, Counter> delayCtr;
+};
+
#endif //__MEM_CACHE_BASE_HH__
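
A hedged illustration of the delay mechanism declared above: updateMode() (re)arms delayCtr[blk_addr] to delay_threshold on every in-stream write, and each time sendMSHRQueuePacket() defers a not-yet-whole-line write it consumes one quantum of blkSize / pkt_size * clockPeriod() ticks. With the defaults (threshold 8, 64-byte blocks) and assumed values for an 8-byte store on a hypothetical 500-tick clock, the budget works out as follows:

#include <cassert>
#include <cstdint>
#include <unordered_map>

using Addr = uint64_t;

int main()
{
    std::unordered_map<Addr, int> delayCtr;
    const int delayThreshold = 8;       // Cache.py default
    const uint64_t blkSize = 64;        // cache line in bytes
    const uint64_t pktSize = 8;         // one 8-byte store
    const uint64_t clockPeriod = 500;   // ticks; value is illustrative

    const Addr blk = 0x1000;
    delayCtr[blk] = delayThreshold;     // armed by updateMode()

    // Mirrors WriteAllocator::delay(): each call consumes one quantum
    // until the budget for this block is exhausted.
    uint64_t total = 0;
    while (delayCtr[blk] > 0) {
        --delayCtr[blk];
        total += blkSize / pktSize * clockPeriod;
    }
    // 8 quanta of (64 / 8) * 500 = 4000 ticks each
    assert(total == 32000);
    return 0;
}
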
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d38680eb8..fb0eb9d05 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -623,8 +623,9 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
// write-line request to the cache that promoted
// the write to a whole line
- blk = handleFill(bus_pkt, blk, writebacks,
- allocOnFill(pkt->cmd));
+ const bool allocate = allocOnFill(pkt->cmd) &&
+ (!writeAllocator || writeAllocator->allocate());
+ blk = handleFill(bus_pkt, blk, writebacks, allocate);
assert(blk != NULL);
is_invalidate = false;
satisfyRequest(pkt, blk);
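
The atomic-path decision above reduces to a simple conjunction: allocate only when the command-based policy allows it and the write allocator, if present, has not switched to write-no-allocate. A tiny truth-table check of that predicate, with stand-in booleans replacing the real calls:

#include <cassert>

// Stand-ins for allocOnFill(pkt->cmd) and writeAllocator->allocate()
static bool fillDecision(bool alloc_on_fill, bool have_allocator,
                         bool allocator_allows)
{
    return alloc_on_fill && (!have_allocator || allocator_allows);
}

int main()
{
    assert(fillDecision(true, false, false));   // no allocator: as before
    assert(fillDecision(true, true, true));     // allocator agrees
    assert(!fillDecision(true, true, false));   // NO_ALLOCATE reached
    assert(!fillDecision(false, true, true));   // command forbids fill
    return 0;
}
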