-rw-r--r--  src/mem/cache/Cache.py    24
-rw-r--r--  src/mem/cache/base.cc     83
-rw-r--r--  src/mem/cache/base.hh    149
-rw-r--r--  src/mem/cache/cache.cc     5
4 files changed, 255 insertions, 6 deletions
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 230131bdc..8ffab911b 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -41,6 +41,7 @@
from m5.params import *
from m5.proxy import *
+from m5.SimObject import SimObject
from MemObject import MemObject
from Prefetcher import BasePrefetcher
from ReplacementPolicies import *
@@ -51,6 +52,24 @@ from Tags import *
# exclusive.
class Clusivity(Enum): vals = ['mostly_incl', 'mostly_excl']
+class WriteAllocator(SimObject):
+ type = 'WriteAllocator'
+ cxx_header = "mem/cache/cache.hh"
+
+ # Control the limits for when the cache introduces extra delays to
+ # allow whole-line write coalescing, and eventually switches to a
+ # write-no-allocate policy.
+ coalesce_limit = Param.Unsigned(2, "Consecutive lines written before "
+ "delaying for coalescing")
+ no_allocate_limit = Param.Unsigned(12, "Consecutive lines written before"
+ " skipping allocation")
+
+ delay_threshold = Param.Unsigned(8, "Number of delay quanta imposed on an "
+ "MSHR with write requests to allow for "
+ "write coalescing")
+
+ block_size = Param.Int(Parent.cache_line_size, "block size in bytes")
+
class BaseCache(MemObject):
type = 'BaseCache'
@@ -116,6 +135,11 @@ class BaseCache(MemObject):
clusivity = Param.Clusivity('mostly_incl',
"Clusivity with upstream cache")
+ # The write allocator enables optimizations for streaming write
+ # accesses by first coalescing writes and then avoiding allocation
+ # in the current cache. Typically, this would be enabled in the
+ # data cache.
+ write_allocator = Param.WriteAllocator(NULL, "Write allocator")
class Cache(BaseCache):
type = 'Cache'
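
For context, a minimal configuration sketch of how the new parameters might be used. This script is not part of the patch; the L1DCache class and its size/latency values are illustrative stand-ins for a typical se.py/fs.py-style setup.

# Hypothetical config snippet: enable the write allocator on a data
# cache. Only write_allocator is new in this patch; everything else
# is a standard classic-cache configuration.
from m5.objects import Cache, WriteAllocator

class L1DCache(Cache):
    size = '32kB'
    assoc = 2
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 8
    # Defaults from Cache.py: start coalescing after 2 consecutive
    # lines of writes, stop allocating after 12.
    write_allocator = WriteAllocator()
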
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index b292e5a25..183d93f14 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -58,6 +58,7 @@
#include "mem/cache/prefetch/base.hh"
#include "mem/cache/queue_entry.hh"
#include "params/BaseCache.hh"
+#include "params/WriteAllocator.hh"
#include "sim/core.hh"
class BaseMasterPort;
@@ -83,6 +84,7 @@ BaseCache::BaseCache(const BaseCacheParams *p, unsigned blk_size)
tags(p->tags),
prefetcher(p->prefetcher),
prefetchOnAccess(p->prefetch_on_access),
+ writeAllocator(p->write_allocator),
writebackClean(p->writeback_clean),
tempBlockWriteback(nullptr),
writebackTempBlockAtomicEvent([this]{ writebackTempBlockAtomic(); },
@@ -243,6 +245,12 @@ void
BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
Tick forward_time, Tick request_time)
{
+ if (writeAllocator &&
+ pkt && pkt->isWrite() && !pkt->req->isUncacheable()) {
+ writeAllocator->updateMode(pkt->getAddr(), pkt->getSize(),
+ pkt->getBlockAddr(blkSize));
+ }
+
if (mshr) {
/// MSHR hit
/// @note writebacks will be checked in getNextMSHR()
@@ -391,11 +399,13 @@ BaseCache::recvTimingReq(PacketPtr pkt)
// already allocated for this, we need to let the prefetcher
// know about the request
- // Don't notify prefetcher on SWPrefetch or cache maintenance
- // operations
+ // Don't notify prefetcher on SWPrefetch, cache maintenance
+ // operations or for writes that we are coalescing.
if (prefetcher && pkt &&
!pkt->cmd.isSWPrefetch() &&
- !pkt->req->isCacheMaintenance()) {
+ !pkt->req->isCacheMaintenance() &&
+ !(writeAllocator && writeAllocator->coalesce() &&
+ pkt->isWrite())) {
next_pf_time = prefetcher->notify(pkt);
}
}
@@ -487,7 +497,9 @@ BaseCache::recvTimingResp(PacketPtr pkt)
DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
pkt->getAddr());
- blk = handleFill(pkt, blk, writebacks, mshr->allocOnFill());
+ const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
+ writeAllocator->allocate() : mshr->allocOnFill();
+ blk = handleFill(pkt, blk, writebacks, allocate);
assert(blk != nullptr);
}
@@ -1461,6 +1473,29 @@ BaseCache::sendMSHRQueuePacket(MSHR* mshr)
DPRINTF(Cache, "%s: MSHR %s\n", __func__, tgt_pkt->print());
+ // if the cache is in write coalescing mode or (additionally) in
+ // no allocation mode, and we have a write packet with an MSHR
+ // that is not a whole-line write (due to incompatible flags etc),
+ // then delay it while coalescing is still possible, and otherwise
+ // reset the write mode
+ if (writeAllocator && writeAllocator->coalesce() && tgt_pkt->isWrite()) {
+ if (!mshr->isWholeLineWrite()) {
+ // if we are currently write coalescing, hold on the
+ // MSHR as many cycles extra as we need to completely
+ // write a cache line
+ if (writeAllocator->delay(mshr->blkAddr)) {
+ Tick delay = blkSize / tgt_pkt->getSize() * clockPeriod();
+ DPRINTF(CacheVerbose, "Delaying pkt %s %llu ticks to allow "
+ "for write coalescing\n", tgt_pkt->print(), delay);
+ mshrQueue.delay(mshr, delay);
+ return false;
+ } else {
+ writeAllocator->reset();
+ }
+ } else {
+ writeAllocator->resetDelay(mshr->blkAddr);
+ }
+ }
+
CacheBlk *blk = tags->findBlock(mshr->blkAddr, mshr->isSecure);
// either a prefetch that is not present upstream, or a normal
@@ -2357,3 +2392,43 @@ BaseCache::MemSidePort::MemSidePort(const std::string &_name,
_snoopRespQueue(*_cache, *this, _label), cache(_cache)
{
}
+
+void
+WriteAllocator::updateMode(Addr write_addr, unsigned write_size,
+ Addr blk_addr)
+{
+ // check if we are continuing where the last write ended
+ if (nextAddr == write_addr) {
+ delayCtr[blk_addr] = delayThreshold;
+ // stop if we have already saturated
+ if (mode != WriteMode::NO_ALLOCATE) {
+ byteCount += write_size;
+ // switch to streaming mode if we have passed the lower
+ // threshold
+ if (mode == WriteMode::ALLOCATE &&
+ byteCount > coalesceLimit) {
+ mode = WriteMode::COALESCE;
+ DPRINTF(Cache, "Switched to write coalescing\n");
+ } else if (mode == WriteMode::COALESCE &&
+ byteCount > noAllocateLimit) {
+ // and continue and switch to non-allocating mode if we
+ // pass the upper threshold
+ mode = WriteMode::NO_ALLOCATE;
+ DPRINTF(Cache, "Switched to write-no-allocate\n");
+ }
+ }
+ } else {
+ // we did not see a write matching the previous one, start
+ // over again
+ byteCount = write_size;
+ mode = WriteMode::ALLOCATE;
+ resetDelay(blk_addr);
+ }
+ nextAddr = write_addr + write_size;
+}
+
+WriteAllocator*
+WriteAllocatorParams::create()
+{
+ return new WriteAllocator(this);
+}
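
To make the thresholds in updateMode() concrete: with the Cache.py defaults (block_size = 64, coalesce_limit = 2, no_allocate_limit = 12), coalesceLimit is 128 bytes and noAllocateLimit is 768 bytes. Below is a minimal standalone sketch of the mode transitions, stripped of the per-block delay counters and SimObject machinery; the WriteStreamModel name is made up for this illustration.

#include <cstdint>
#include <cstdio>

enum class WriteMode { ALLOCATE, COALESCE, NO_ALLOCATE };

struct WriteStreamModel {
    uint64_t nextAddr = 0;
    uint32_t byteCount = 0;
    WriteMode mode = WriteMode::ALLOCATE;
    const uint32_t coalesceLimit = 2 * 64;    // Cache.py defaults
    const uint32_t noAllocateLimit = 12 * 64;

    void write(uint64_t addr, unsigned size) {
        if (addr == nextAddr) {
            // same saturation behaviour as updateMode() above
            if (mode != WriteMode::NO_ALLOCATE) {
                byteCount += size;
                if (mode == WriteMode::ALLOCATE &&
                    byteCount > coalesceLimit) {
                    mode = WriteMode::COALESCE;
                } else if (mode == WriteMode::COALESCE &&
                           byteCount > noAllocateLimit) {
                    mode = WriteMode::NO_ALLOCATE;
                }
            }
        } else {
            // stream broken: restart tracking from this write
            byteCount = size;
            mode = WriteMode::ALLOCATE;
        }
        nextAddr = addr + size;
    }
};

int main()
{
    WriteStreamModel m;
    // A memset-like stream of 16-byte stores: byteCount exceeds 128
    // on the 9th store (144 bytes) and 768 on the 49th (784 bytes).
    for (unsigned i = 0; i < 64; i++) {
        m.write(i * 16, 16);
        printf("store %2u: mode %d\n", i + 1, static_cast<int>(m.mode));
    }
    return 0;
}
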
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 47218f828..b9fd7f943 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -73,6 +73,7 @@
#include "mem/packet_queue.hh"
#include "mem/qport.hh"
#include "mem/request.hh"
+#include "params/WriteAllocator.hh"
#include "sim/eventq.hh"
#include "sim/serialize.hh"
#include "sim/sim_exit.hh"
@@ -329,6 +330,22 @@ class BaseCache : public MemObject
const bool prefetchOnAccess;
/**
+ * The writeAllocator drives optimizations for streaming writes.
+ * It first determines whether a WriteReq MSHR should be delayed,
+ * thus ensuring that we wait longer in cases when we are write
+ * coalescing and allowing all the bytes of the line to be written
+ * before the MSHR packet is sent downstream. This works in unison
+ * with the tracking in the MSHR to check if the entire line is
+ * written. The write mode also affects the behaviour on filling
+ * any whole-line writes. Normally the cache allocates the line
+ * when receiving the InvalidateResp, but after seeing enough
+ * consecutive lines we switch to using the tempBlock, and thus
+ * end up not allocating the line, and instead turning the
+ * whole-line write into a writeback straight away.
+ */
+ WriteAllocator * const writeAllocator;
+
+ /**
* Temporary cache block for occasional transitory use. We use
* the tempBlock to fill when allocation fails (e.g., when there
* is an outstanding request that accesses the victim block) or
@@ -1161,4 +1178,136 @@ class BaseCache : public MemObject
};
+/**
+ * The write allocator inspects write packets and detects streaming
+ * patterns. The write allocator supports a single stream where writes
+ * are expected to access consecutive locations, and keeps track of the
+ * size of the area covered by these consecutive writes in byteCount.
+ *
+ * 1) When byteCount has surpassed the coalesceLimit, the mode
+ * switches from ALLOCATE to COALESCE, where writes should be delayed
+ * until the whole block is written, at which point a single packet
+ * (a whole-line write) can service them.
+ *
+ * 2) When byteCount has also exceeded the noAllocateLimit, we switch
+ * to NO_ALLOCATE, where writes should not allocate in the cache but
+ * rather be sent as a whole-line write to the memory below.
+ */
+class WriteAllocator : public SimObject {
+ public:
+ WriteAllocator(const WriteAllocatorParams *p) :
+ SimObject(p),
+ coalesceLimit(p->coalesce_limit * p->block_size),
+ noAllocateLimit(p->no_allocate_limit * p->block_size),
+ delayThreshold(p->delay_threshold)
+ {
+ reset();
+ }
+
+ /**
+ * Should writes be coalesced? This is true if the mode is either
+ * COALESCE or NO_ALLOCATE.
+ *
+ * @return true if the cache should coalesce writes.
+ */
+ bool coalesce() const {
+ return mode != WriteMode::ALLOCATE;
+ }
+
+ /**
+ * Should writes allocate?
+ *
+ * @return true if the cache should allocate for writes.
+ */
+ bool allocate() const {
+ return mode != WriteMode::NO_ALLOCATE;
+ }
+
+ /**
+ * Reset the write allocator state, meaning that it allocates for
+ * writes and has not recorded any information about qualifying
+ * writes that might trigger a switch to coalescing and later no
+ * allocation.
+ */
+ void reset() {
+ mode = WriteMode::ALLOCATE;
+ byteCount = 0;
+ nextAddr = 0;
+ }
+
+ /**
+ * Check whether the current write needs to be delayed.
+ *
+ * @param blk_addr The block address the packet writes to
+ * @return true if the current packet should be delayed
+ */
+ bool delay(Addr blk_addr) {
+ if (delayCtr[blk_addr] > 0) {
+ --delayCtr[blk_addr];
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Clear delay counter for the input block
+ *
+ * @param blk_addr The accessed cache block
+ */
+ void resetDelay(Addr blk_addr) {
+ delayCtr.erase(blk_addr);
+ }
+
+ /**
+ * Update the write mode based on the current write
+ * packet. This method compares the packet's address with any
+ * current stream, and updates the tracking and the mode
+ * accordingly.
+ *
+ * @param write_addr Start address of the write request
+ * @param write_size Size of the write request
+ * @param blk_addr The block address that this packet writes to
+ */
+ void updateMode(Addr write_addr, unsigned write_size, Addr blk_addr);
+
+ private:
+ /**
+ * The current mode for write coalescing and allocation, either
+ * normal operation (ALLOCATE), write coalescing (COALESCE), or
+ * write coalescing without allocation (NO_ALLOCATE).
+ */
+ enum class WriteMode : char {
+ ALLOCATE,
+ COALESCE,
+ NO_ALLOCATE,
+ };
+ WriteMode mode;
+
+ /** Address to match writes against to detect streams. */
+ Addr nextAddr;
+
+ /**
+ * Bytes written contiguously. Saturating once we no longer
+ * allocate.
+ */
+ uint32_t byteCount;
+
+ /**
+ * Limits for when to switch between the different write modes.
+ */
+ const uint32_t coalesceLimit;
+ const uint32_t noAllocateLimit;
+ /**
+ * The number of times the allocator will delay a WriteReq MSHR.
+ */
+ const uint32_t delayThreshold;
+
+ /**
+ * Per-block counters of remaining delay quanta, tracking how many
+ * more times the allocator may delay a WriteReq MSHR.
+ */
+ std::unordered_map<Addr, Counter> delayCtr;
+};
+
#endif //__MEM_CACHE_BASE_HH__
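
A hedged illustration of the delay mechanism declared above: updateMode() (re)arms delayCtr[blk_addr] to delay_threshold on every in-stream write, and each time sendMSHRQueuePacket() defers a not-yet-whole-line write it consumes one quantum of blkSize / pkt_size * clockPeriod() ticks. With the defaults (threshold 8, 64-byte blocks) and assumed values for an 8-byte store on a hypothetical 500-tick clock, the budget works out as follows:

#include <cassert>
#include <cstdint>
#include <unordered_map>

using Addr = uint64_t;

int main()
{
    std::unordered_map<Addr, int> delayCtr;
    const int delayThreshold = 8;       // Cache.py default
    const uint64_t blkSize = 64;        // cache line in bytes
    const uint64_t pktSize = 8;         // one 8-byte store
    const uint64_t clockPeriod = 500;   // ticks; value is illustrative

    const Addr blk = 0x1000;
    delayCtr[blk] = delayThreshold;     // armed by updateMode()

    // Mirrors WriteAllocator::delay(): each call consumes one quantum
    // until the budget for this block is exhausted.
    uint64_t total = 0;
    while (delayCtr[blk] > 0) {
        --delayCtr[blk];
        total += blkSize / pktSize * clockPeriod;
    }
    // 8 quanta of (64 / 8) * 500 = 4000 ticks each
    assert(total == 32000);
    return 0;
}
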
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d38680eb8..fb0eb9d05 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -623,8 +623,9 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
// write-line request to the cache that promoted
// the write to a whole line
- blk = handleFill(bus_pkt, blk, writebacks,
- allocOnFill(pkt->cmd));
+ const bool allocate = allocOnFill(pkt->cmd) &&
+ (!writeAllocator || writeAllocator->allocate());
+ blk = handleFill(bus_pkt, blk, writebacks, allocate);
assert(blk != NULL);
is_invalidate = false;
satisfyRequest(pkt, blk);
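
The atomic-path decision above reduces to a simple conjunction: allocate only when the command-based policy allows it and the write allocator, if present, has not switched to write-no-allocate. A tiny truth-table check of that predicate, with stand-in booleans replacing the real calls:

#include <cassert>

// Stand-ins for allocOnFill(pkt->cmd) and writeAllocator->allocate()
static bool fillDecision(bool alloc_on_fill, bool have_allocator,
                         bool allocator_allows)
{
    return alloc_on_fill && (!have_allocator || allocator_allows);
}

int main()
{
    assert(fillDecision(true, false, false));   // no allocator: as before
    assert(fillDecision(true, true, true));     // allocator agrees
    assert(!fillDecision(true, true, false));   // NO_ALLOCATE reached
    assert(!fillDecision(false, true, true));   // command forbids fill
    return 0;
}
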