-rw-r--r--  src/mem/cache/Cache.py |  24
-rw-r--r--  src/mem/cache/base.cc  |  83
-rw-r--r--  src/mem/cache/base.hh  | 149
-rw-r--r--  src/mem/cache/cache.cc |   5
4 files changed, 255 insertions, 6 deletions
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 230131bdc..8ffab911b 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -41,6 +41,7 @@
 from m5.params import *
 from m5.proxy import *
+from m5.SimObject import SimObject
 from MemObject import MemObject
 from Prefetcher import BasePrefetcher
 from ReplacementPolicies import *
@@ -51,6 +52,24 @@ from Tags import *
 # exclusive.
 class Clusivity(Enum): vals = ['mostly_incl', 'mostly_excl']
 
+class WriteAllocator(SimObject):
+    type = 'WriteAllocator'
+    cxx_header = "mem/cache/cache.hh"
+
+    # Control the limits for when the cache introduces extra delays to
+    # allow whole-line write coalescing, and eventually switches to a
+    # write-no-allocate policy.
+    coalesce_limit = Param.Unsigned(2, "Consecutive lines written before "
+                                    "delaying for coalescing")
+    no_allocate_limit = Param.Unsigned(12, "Consecutive lines written before"
+                                       " skipping allocation")
+
+    delay_threshold = Param.Unsigned(8, "Number of delay quanta imposed on an "
+                                    "MSHR with write requests to allow for "
+                                    "write coalescing")
+
+    block_size = Param.Int(Parent.cache_line_size, "block size in bytes")
+
 class BaseCache(MemObject):
     type = 'BaseCache'
@@ -116,6 +135,11 @@ class BaseCache(MemObject):
     clusivity = Param.Clusivity('mostly_incl',
                                 "Clusivity with upstream cache")
 
+    # The write allocator enables optimizations for streaming write
+    # accesses by first coalescing writes and then avoiding allocation
+    # in the current cache. Typically, this would be enabled in the
+    # data cache.
+    write_allocator = Param.WriteAllocator(NULL, "Write allocator")
 
 class Cache(BaseCache):
     type = 'Cache'
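To illustrate how the new SimObject is wired up, the sketch below shows a config script enabling the allocator on a data cache. This is a hypothetical example: the L1DCache class and its sizing parameters are placeholders, while write_allocator and the WriteAllocator defaults are the ones introduced by this patch.

    # Hypothetical gem5 config sketch; L1DCache and its sizing are
    # illustrative placeholders, not part of this patch.
    from m5.objects import Cache, WriteAllocator

    class L1DCache(Cache):
        size = '32kB'
        assoc = 4
        tag_latency = 2
        data_latency = 2
        response_latency = 2
        mshrs = 16
        tgts_per_mshr = 8
        # Attach the allocator with its defaults: start coalescing
        # after 2 consecutive lines, stop allocating after 12, and
        # inherit block_size from the parent's cache_line_size.
        write_allocator = WriteAllocator()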
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index b292e5a25..183d93f14 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -58,6 +58,7 @@
 #include "mem/cache/prefetch/base.hh"
 #include "mem/cache/queue_entry.hh"
 #include "params/BaseCache.hh"
+#include "params/WriteAllocator.hh"
 #include "sim/core.hh"
 
 class BaseMasterPort;
@@ -83,6 +84,7 @@ BaseCache::BaseCache(const BaseCacheParams *p, unsigned blk_size)
       tags(p->tags),
       prefetcher(p->prefetcher),
       prefetchOnAccess(p->prefetch_on_access),
+      writeAllocator(p->write_allocator),
       writebackClean(p->writeback_clean),
       tempBlockWriteback(nullptr),
       writebackTempBlockAtomicEvent([this]{ writebackTempBlockAtomic(); },
@@ -243,6 +245,12 @@ void
 BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
                                Tick forward_time, Tick request_time)
 {
+    if (writeAllocator &&
+        pkt && pkt->isWrite() && !pkt->req->isUncacheable()) {
+        writeAllocator->updateMode(pkt->getAddr(), pkt->getSize(),
+                                   pkt->getBlockAddr(blkSize));
+    }
+
     if (mshr) {
         /// MSHR hit
         /// @note writebacks will be checked in getNextMSHR()
@@ -391,11 +399,13 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         // already allocated for this, we need to let the prefetcher
         // know about the request
 
-        // Don't notify prefetcher on SWPrefetch or cache maintenance
-        // operations
+        // Don't notify prefetcher on SWPrefetch, cache maintenance
+        // operations or for writes that we are coalescing.
         if (prefetcher && pkt && !pkt->cmd.isSWPrefetch() &&
-            !pkt->req->isCacheMaintenance()) {
+            !pkt->req->isCacheMaintenance() &&
+            !(writeAllocator && writeAllocator->coalesce() &&
+              pkt->isWrite())) {
             next_pf_time = prefetcher->notify(pkt);
         }
     }
@@ -487,7 +497,9 @@ BaseCache::recvTimingResp(PacketPtr pkt)
         DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
                 pkt->getAddr());
 
-        blk = handleFill(pkt, blk, writebacks, mshr->allocOnFill());
+        const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
+            writeAllocator->allocate() : mshr->allocOnFill();
+        blk = handleFill(pkt, blk, writebacks, allocate);
         assert(blk != nullptr);
     }
@@ -1461,6 +1473,29 @@ BaseCache::sendMSHRQueuePacket(MSHR* mshr)
 
     DPRINTF(Cache, "%s: MSHR %s\n", __func__, tgt_pkt->print());
 
+    // if the cache is in write coalescing mode or (additionally) in
+    // no allocation mode, and we have a write packet with an MSHR
+    // that is not a whole-line write (due to incompatible flags etc),
+    // then reset the write mode
+    if (writeAllocator && writeAllocator->coalesce() && tgt_pkt->isWrite()) {
+        if (!mshr->isWholeLineWrite()) {
+            // if we are currently write coalescing, hold on the
+            // MSHR as many cycles extra as we need to completely
+            // write a cache line
+            if (writeAllocator->delay(mshr->blkAddr)) {
+                Tick delay = blkSize / tgt_pkt->getSize() * clockPeriod();
+                DPRINTF(CacheVerbose, "Delaying pkt %s %llu ticks to allow "
+                        "for write coalescing\n", tgt_pkt->print(), delay);
+                mshrQueue.delay(mshr, delay);
+                return false;
+            } else {
+                writeAllocator->reset();
+            }
+        } else {
+            writeAllocator->resetDelay(mshr->blkAddr);
+        }
+    }
+
     CacheBlk *blk = tags->findBlock(mshr->blkAddr, mshr->isSecure);
 
     // either a prefetch that is not present upstream, or a normal
@@ -2357,3 +2392,43 @@ BaseCache::MemSidePort::MemSidePort(const std::string &_name,
       _snoopRespQueue(*_cache, *this, _label), cache(_cache)
 {
 }
+
+void
+WriteAllocator::updateMode(Addr write_addr, unsigned write_size,
+                           Addr blk_addr)
+{
+    // check if we are continuing where the last write ended
+    if (nextAddr == write_addr) {
+        delayCtr[blk_addr] = delayThreshold;
+        // stop if we have already saturated
+        if (mode != WriteMode::NO_ALLOCATE) {
+            byteCount += write_size;
+            // switch to streaming mode if we have passed the lower
+            // threshold
+            if (mode == WriteMode::ALLOCATE &&
+                byteCount > coalesceLimit) {
+                mode = WriteMode::COALESCE;
+                DPRINTF(Cache, "Switched to write coalescing\n");
+            } else if (mode == WriteMode::COALESCE &&
+                       byteCount > noAllocateLimit) {
+                // and continue and switch to non-allocating mode if we
+                // pass the upper threshold
+                mode = WriteMode::NO_ALLOCATE;
+                DPRINTF(Cache, "Switched to write-no-allocate\n");
+            }
+        }
+    } else {
+        // we did not see a write matching the previous one, start
+        // over again
+        byteCount = write_size;
+        mode = WriteMode::ALLOCATE;
+        resetDelay(blk_addr);
+    }
+    nextAddr = write_addr + write_size;
+}
+
+WriteAllocator*
+WriteAllocatorParams::create()
+{
+    return new WriteAllocator(this);
+}
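Since updateMode drives all the mode transitions, it is worth making the thresholds concrete. The following minimal Python model (not gem5 code; the constants assume the default parameters and a 64-byte line) replays a contiguous stream of line-sized writes and shows when the transitions fire:

    # Python model of updateMode's state machine, assuming defaults:
    # 64-byte lines, coalesce_limit = 2 and no_allocate_limit = 12,
    # i.e. byte thresholds of 128 and 768.
    BLK = 64
    COALESCE_LIMIT = 2 * BLK
    NO_ALLOCATE_LIMIT = 12 * BLK

    mode, byte_count, next_addr = 'ALLOCATE', 0, 0
    for i in range(16):                # sixteen back-to-back line writes
        addr, size = i * BLK, BLK
        if addr == next_addr:          # stream continues
            if mode != 'NO_ALLOCATE':  # byteCount saturates after that
                byte_count += size
                if mode == 'ALLOCATE' and byte_count > COALESCE_LIMIT:
                    mode = 'COALESCE'      # 3rd line: 192 > 128
                elif mode == 'COALESCE' and byte_count > NO_ALLOCATE_LIMIT:
                    mode = 'NO_ALLOCATE'   # 13th line: 832 > 768
        else:                          # stream broken: start over
            byte_count, mode = size, 'ALLOCATE'
        next_addr = addr + size
        print(i + 1, hex(addr), mode)

Note how a single non-contiguous write drops the model back to ALLOCATE, mirroring the else-branch of updateMode above.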
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 47218f828..b9fd7f943 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -73,6 +73,7 @@
 #include "mem/packet_queue.hh"
 #include "mem/qport.hh"
 #include "mem/request.hh"
+#include "params/WriteAllocator.hh"
 #include "sim/eventq.hh"
 #include "sim/serialize.hh"
 #include "sim/sim_exit.hh"
@@ -329,6 +330,22 @@ class BaseCache : public MemObject
     const bool prefetchOnAccess;
 
     /**
+     * The writeAllocator drives optimizations for streaming writes.
+     * It first determines whether a WriteReq MSHR should be delayed,
+     * thus ensuring that we wait longer in cases when we are write
+     * coalescing and allowing all the bytes of the line to be written
+     * before the MSHR packet is sent downstream. This works in unison
+     * with the tracking in the MSHR to check if the entire line is
+     * written. The write mode also affects the behaviour on filling
+     * any whole-line writes. Normally the cache allocates the line
+     * when receiving the InvalidateResp, but after seeing enough
+     * consecutive lines we switch to using the tempBlock, and thus
+     * end up not allocating the line, and instead turning the
+     * whole-line write into a writeback straight away.
+     */
+    WriteAllocator * const writeAllocator;
+
+    /**
      * Temporary cache block for occasional transitory use. We use
      * the tempBlock to fill when allocation fails (e.g., when there
      * is an outstanding request that accesses the victim block) or
@@ -1161,4 +1178,136 @@ class BaseCache : public MemObject
 
 };
 
+/**
+ * The write allocator inspects write packets and detects streaming
+ * patterns. The write allocator supports a single stream where writes
+ * are expected to access consecutive locations and keeps track of the
+ * size of the area covered by the consecutive writes in byteCount.
+ *
+ * 1) When byteCount has surpassed the coalesceLimit the mode
+ * switches from ALLOCATE to COALESCE where writes should be delayed
+ * until the whole block is written at which point a single packet
+ * (whole line write) can service them.
+ *
+ * 2) When byteCount has also exceeded the noAllocateLimit (whole
+ * line) we switch to NO_ALLOCATE where writes should not allocate in
+ * the cache but rather send a whole line write to the memory below.
+ */
+class WriteAllocator : public SimObject {
+  public:
+    WriteAllocator(const WriteAllocatorParams *p) :
+        SimObject(p),
+        coalesceLimit(p->coalesce_limit * p->block_size),
+        noAllocateLimit(p->no_allocate_limit * p->block_size),
+        delayThreshold(p->delay_threshold)
+    {
+        reset();
+    }
+
+    /**
+     * Should writes be coalesced? This is true whenever the mode is
+     * not ALLOCATE, i.e. in both COALESCE and NO_ALLOCATE mode.
+     *
+     * @return true if the cache should coalesce writes.
+     */
+    bool coalesce() const {
+        return mode != WriteMode::ALLOCATE;
+    }
+
+    /**
+     * Should writes allocate? This is true unless the mode is
+     * NO_ALLOCATE.
+     *
+     * @return true if the cache should allocate for writes.
+     */
+    bool allocate() const {
+        return mode != WriteMode::NO_ALLOCATE;
+    }
+
+    /**
+     * Reset the write allocator state, meaning that it allocates for
+     * writes and has not recorded any information about qualifying
+     * writes that might trigger a switch to coalescing and later no
+     * allocation.
+     */
+    void reset() {
+        mode = WriteMode::ALLOCATE;
+        byteCount = 0;
+        nextAddr = 0;
+    }
+
+    /**
+     * Check whether the current write needs to be delayed. Each call
+     * consumes one of the block's remaining delay quanta.
+     *
+     * @param blk_addr The block address the packet writes to
+     * @return true if the current packet should be delayed
+     */
+    bool delay(Addr blk_addr) {
+        if (delayCtr[blk_addr] > 0) {
+            --delayCtr[blk_addr];
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * Clear the delay counter for the input block
+     *
+     * @param blk_addr The accessed cache block
+     */
+    void resetDelay(Addr blk_addr) {
+        delayCtr.erase(blk_addr);
+    }
+
+    /**
+     * Update the write mode based on the current write
+     * packet. This method compares the packet's address with any
+     * current stream, and updates the tracking and the mode
+     * accordingly.
+     *
+     * @param write_addr Start address of the write request
+     * @param write_size Size of the write request
+     * @param blk_addr The block address that this packet writes to
+     */
+    void updateMode(Addr write_addr, unsigned write_size, Addr blk_addr);
+
+  private:
+    /**
+     * The current mode for write coalescing and allocation, either
+     * normal operation (ALLOCATE), write coalescing (COALESCE), or
+     * write coalescing without allocation (NO_ALLOCATE).
+     */
+    enum class WriteMode : char {
+        ALLOCATE,
+        COALESCE,
+        NO_ALLOCATE,
+    };
+    WriteMode mode;
+
+    /** Address to match writes against to detect streams. */
+    Addr nextAddr;
+
+    /**
+     * Bytes written contiguously. Saturating once we no longer
+     * allocate.
+     */
+    uint32_t byteCount;
+
+    /**
+     * Limits for when to switch between the different write modes.
+     */
+    const uint32_t coalesceLimit;
+    const uint32_t noAllocateLimit;
+    /**
+     * The number of times the allocator will delay a WriteReq MSHR.
+     */
+    const uint32_t delayThreshold;
+
+    /**
+     * Keep track of the number of times the allocator has delayed a
+     * WriteReq MSHR.
+     */
+    std::unordered_map<Addr, Counter> delayCtr;
+};
+
 #endif //__MEM_CACHE_BASE_HH__
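The delay bookkeeping is easiest to see with numbers. Here is a back-of-the-envelope sketch under assumed parameters (64-byte lines, 8-byte store packets, a 1 ns clock, and the default delay_threshold of 8; not gem5 code):

    # Assumed numbers, not gem5 code: each time sendMSHRQueuePacket
    # defers a partially written WriteReq MSHR, it holds it for
    # blkSize / pkt_size * clockPeriod ticks.
    blk_size, pkt_size, clock_period = 64, 8, 1000   # 1 ns = 1000 ticks

    delay_per_quantum = blk_size // pkt_size * clock_period   # 8000 ticks
    delay_threshold = 8                                       # default param
    # Upper bound if no further stream writes refresh delayCtr:
    max_hold = delay_threshold * delay_per_quantum            # 64000 ticks
    print(delay_per_quantum, max_hold)                        # 8000 64000

In practice each continuing write in updateMode resets delayCtr[blk_addr] back to delayThreshold, so an actively written line keeps being held until it becomes a whole-line write; once the quanta run out, delay() returns false and reset() drops the allocator back to ALLOCATE.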
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index d38680eb8..fb0eb9d05 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -623,8 +623,9 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
             // write-line request to the cache that promoted
             // the write to a whole line
 
-            blk = handleFill(bus_pkt, blk, writebacks,
-                             allocOnFill(pkt->cmd));
+            const bool allocate = allocOnFill(pkt->cmd) &&
+                (!writeAllocator || writeAllocator->allocate());
+            blk = handleFill(bus_pkt, blk, writebacks, allocate);
             assert(blk != NULL);
             is_invalidate = false;
             satisfyRequest(pkt, blk);
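Putting the fill-path changes together, the allocation decision after this patch can be summarised as follows (an illustrative Python condensation of the recvTimingResp and handleAtomicReqMiss hunks above, not gem5 code):

    # Illustrative condensation of the two fill paths, not gem5 code.
    def timing_fill_allocates(write_allocator, mshr):
        # The allocator only overrides allocOnFill for MSHRs that
        # started out as whole-line writes.
        if write_allocator and mshr.was_whole_line_write:
            return write_allocator.allocate()   # False only in NO_ALLOCATE
        return mshr.alloc_on_fill

    def atomic_fill_allocates(write_allocator, alloc_on_fill):
        # The atomic path simply vetoes allocation once the allocator
        # is in write-no-allocate mode.
        return alloc_on_fill and (write_allocator is None or
                                  write_allocator.allocate())

When allocation is vetoed, the fill lands in the tempBlock and is immediately turned into a writeback, as described in the base.hh comment above.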