-rw-r--r--  configs/common/O3_ARM_v7a.py |   1
-rw-r--r--  src/mem/cache/Cache.py       |  16
-rw-r--r--  src/mem/cache/base.hh        |  12
-rw-r--r--  src/mem/cache/cache.cc       | 114
-rw-r--r--  src/mem/cache/cache.hh       |  71
-rw-r--r--  src/mem/cache/mshr.cc        |  14
-rw-r--r--  src/mem/cache/mshr.hh        |   9
-rw-r--r--  src/mem/cache/mshr_queue.cc  |   4
-rw-r--r--  src/mem/cache/mshr_queue.hh  |   3

9 files changed, 207 insertions, 37 deletions
diff --git a/configs/common/O3_ARM_v7a.py b/configs/common/O3_ARM_v7a.py
index 9f250f57d..02beb11d1 100644
--- a/configs/common/O3_ARM_v7a.py
+++ b/configs/common/O3_ARM_v7a.py
@@ -185,6 +185,7 @@ class O3_ARM_v7aL2(Cache):
     assoc = 16
     write_buffers = 8
     prefetch_on_access = True
+    clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)
     tags = RandomRepl()
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 8ad1177e7..48e52a8d5 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -84,6 +84,22 @@ class BaseCache(MemObject):
     system = Param.System(Parent.any, "System we belong to")
 
+# Enum for cache clusivity, currently mostly inclusive or mostly
+# exclusive.
+class Clusivity(Enum): vals = ['mostly_incl', 'mostly_excl']
+
 class Cache(BaseCache):
     type = 'Cache'
     cxx_header = 'mem/cache/cache.hh'
 
+    # Control whether this cache should be mostly inclusive or mostly
+    # exclusive with respect to upstream caches. The behaviour on a
+    # fill is determined accordingly. For a mostly inclusive cache,
+    # blocks are allocated on all fill operations. Thus, L1 caches
+    # should be set as mostly inclusive even if they have no upstream
+    # caches. In the case of a mostly exclusive cache, fills are not
+    # allocating unless they came directly from a non-caching source,
+    # e.g. a table walker. Additionally, on a hit from an upstream
+    # cache a line is dropped for a mostly exclusive cache.
+    clusivity = Param.Clusivity('mostly_incl',
+                                "Clusivity with upstream cache")
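As the O3_ARM_v7a.py hunk above shows, a configuration script opts in per cache level. A minimal sketch of how the new parameter would be used in a two-level setup (class names, sizes, and latencies here are illustrative, not part of this patch):

    from m5.objects import Cache

    class L1DCache(Cache):
        # L1 caches stay mostly inclusive, even with no cache above them
        size = '32kB'
        assoc = 2
        hit_latency = 2
        response_latency = 2
        mshrs = 4
        tgts_per_mshr = 20
        clusivity = 'mostly_incl'

    class L2Cache(Cache):
        # a mostly exclusive L2 behaves much like a victim cache for
        # the L1s: it only allocates fills that did not come from a
        # cache, and drops a line when a cache above hits on it
        size = '1MB'
        assoc = 16
        hit_latency = 12
        response_latency = 12
        mshrs = 16
        tgts_per_mshr = 8
        clusivity = 'mostly_excl'
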
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index a992583fe..cb1baa3f4 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -210,7 +210,8 @@ class BaseCache : public MemObject
         // overlap
         assert(addr == blockAlign(addr));
 
-        MSHR *mshr = mq->allocate(addr, size, pkt, time, order++);
+        MSHR *mshr = mq->allocate(addr, size, pkt, time, order++,
+                                  allocOnFill(pkt->cmd));
 
         if (mq->isFull()) {
             setBlocked((BlockedCause)mq->index);
@@ -234,6 +235,15 @@ class BaseCache : public MemObject
     }
 
     /**
+     * Determine if we should allocate on a fill or not.
+     *
+     * @param cmd Packet command being added as an MSHR target
+     *
+     * @return Whether we should allocate on a fill or not
+     */
+    virtual bool allocOnFill(MemCmd cmd) const = 0;
+
+    /**
      * Write back dirty blocks in the cache using functional accesses.
      */
     virtual void memWriteback() = 0;
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index a03790abc..58afdc79a 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -68,7 +68,11 @@ Cache::Cache(const CacheParams *p)
       tags(p->tags),
       prefetcher(p->prefetcher),
       doFastWrites(true),
-      prefetchOnAccess(p->prefetch_on_access)
+      prefetchOnAccess(p->prefetch_on_access),
+      clusivity(p->clusivity),
+      tempBlockWriteback(nullptr),
+      writebackTempBlockAtomicEvent(this, false,
+                                    EventBase::Delayed_Writeback_Pri)
 {
     tempBlock = new CacheBlk();
     tempBlock->data = new uint8_t[blkSize];
@@ -198,10 +202,10 @@ Cache::satisfyCpuSideRequest(PacketPtr pkt, CacheBlk *blk,
             if (blk->isDirty()) {
                 pkt->assertMemInhibit();
             }
-            // on ReadExReq we give up our copy unconditionally
-            if (blk != tempBlock)
-                tags->invalidate(blk);
-            blk->invalidate();
+            // on ReadExReq we give up our copy unconditionally,
+            // even if this cache is mostly inclusive, we may want
+            // to revisit this
+            invalidateBlock(blk);
         } else if (blk->isWritable() && !pending_downgrade &&
                    !pkt->sharedAsserted() &&
                    pkt->cmd != MemCmd::ReadCleanReq) {
@@ -220,9 +224,30 @@ Cache::satisfyCpuSideRequest(PacketPtr pkt, CacheBlk *blk,
                 if (!deferred_response) {
                     // if we are responding immediately and can
                     // signal that we're transferring ownership
-                    // along with exclusivity, do so
+                    // (inhibit set) along with exclusivity
+                    // (shared not set), do so
                     pkt->assertMemInhibit();
+
+                    // if this cache is mostly inclusive, we keep
+                    // the block as writable (exclusive), and pass
+                    // it upwards as writable and dirty
+                    // (modified), hence we have multiple caches
+                    // considering the same block writable,
+                    // something that we get away with due to the
+                    // fact that: 1) this cache has been
+                    // considered the ordering point and
+                    // responded to all snoops up till now, and 2)
+                    // we always snoop upwards before consulting
+                    // the local cache, both on a normal request
+                    // (snooping done by the crossbar), and on a
+                    // snoop
                     blk->status &= ~BlkDirty;
+
+                    // if this cache is mostly exclusive with
+                    // respect to the cache above, drop the block
+                    if (clusivity == Enums::mostly_excl) {
+                        invalidateBlock(blk);
+                    }
                 } else {
                     // if we're responding after our own miss,
                     // there's a window where the recipient didn't
@@ -241,9 +266,10 @@ Cache::satisfyCpuSideRequest(PacketPtr pkt, CacheBlk *blk,
         // Upgrade or Invalidate, since we have it Exclusively (E or
         // M), we ack then invalidate.
         assert(pkt->isUpgrade() || pkt->isInvalidate());
-        assert(blk != tempBlock);
-        tags->invalidate(blk);
-        blk->invalidate();
+
+        // for invalidations we could be looking at the temp block
+        // (for upgrades we always allocate)
+        invalidateBlock(blk);
         DPRINTF(Cache, "%s for %s addr %#llx size %d (invalidation)\n",
                 __func__, pkt->cmdString(), pkt->getAddr(), pkt->getSize());
     }
@@ -761,7 +787,8 @@ Cache::recvTimingReq(PacketPtr pkt)
                 // buffer and to schedule an event to the queued
                 // port and also takes into account the additional
                 // delay of the xbar.
-                mshr->allocateTarget(pkt, forward_time, order++);
+                mshr->allocateTarget(pkt, forward_time, order++,
+                                     allocOnFill(pkt->cmd));
                 if (mshr->getNumTargets() == numTarget) {
                     noTargetMSHR = mshr;
                     setBlocked(Blocked_NoTargets);
@@ -1027,13 +1054,15 @@ Cache::recvAtomic(PacketPtr pkt)
                 // write-line request to the cache that promoted
                 // the write to a whole line
 
-                blk = handleFill(pkt, blk, writebacks);
+                blk = handleFill(pkt, blk, writebacks,
+                                 allocOnFill(pkt->cmd));
                 satisfyCpuSideRequest(pkt, blk);
             } else if (bus_pkt->isRead() ||
                        bus_pkt->cmd == MemCmd::UpgradeResp) {
                 // we're updating cache state to allow us to
                 // satisfy the upstream request from the cache
-                blk = handleFill(bus_pkt, blk, writebacks);
+                blk = handleFill(bus_pkt, blk, writebacks,
+                                 allocOnFill(pkt->cmd));
                 satisfyCpuSideRequest(pkt, blk);
             } else {
                 // we're satisfying the upstream request without
@@ -1056,9 +1085,34 @@ Cache::recvAtomic(PacketPtr pkt)
    // immediately rather than calling requestMemSideBus() as we do
    // there).
 
-    // Handle writebacks (from the response handling) if needed
+    // do any writebacks resulting from the response handling
     doWritebacksAtomic(writebacks);
 
+    // if we used the temp block, check to see if it's valid and if so
+    // clear it out, but only do so after the call to recvAtomic is
+    // finished so that any downstream observers (such as a snoop
+    // filter) first see the fill, and only then see the eviction
+    if (blk == tempBlock && tempBlock->isValid()) {
+        // the atomic CPU calls recvAtomic for fetch and load/store
+        // sequentially, and we may already have a tempBlock
+        // writeback from the fetch that we have not yet sent
+        if (tempBlockWriteback) {
+            // if that is the case, write the previous one back, and
+            // do not schedule any new event
+            writebackTempBlockAtomic();
+        } else {
+            // the writeback/clean eviction happens after the call to
+            // recvAtomic has finished (but before any successive
+            // calls), so that the response handling from the fill is
+            // allowed to happen first
+            schedule(writebackTempBlockAtomicEvent, curTick());
+        }
+
+        tempBlockWriteback = blk->isDirty() ? writebackBlk(blk) :
+            cleanEvictBlk(blk);
+        blk->invalidate();
+    }
+
     if (pkt->needsResponse()) {
         pkt->makeAtomicResponse();
     }
@@ -1214,7 +1268,7 @@ Cache::recvTimingResp(PacketPtr pkt)
         DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
                 pkt->getAddr());
 
-        blk = handleFill(pkt, blk, writebacks);
+        blk = handleFill(pkt, blk, writebacks, mshr->allocOnFill);
         assert(blk != NULL);
     }
@@ -1258,7 +1312,7 @@ Cache::recvTimingResp(PacketPtr pkt)
             // deferred targets if possible
             mshr->promoteExclusive();
             // NB: we use the original packet here and not the response!
-            blk = handleFill(tgt_pkt, blk, writebacks);
+            blk = handleFill(tgt_pkt, blk, writebacks, mshr->allocOnFill);
             assert(blk != NULL);
 
             // treat as a fill, and discard the invalidation
@@ -1362,9 +1416,7 @@ Cache::recvTimingResp(PacketPtr pkt)
         // should not invalidate the block, so check if the
         // invalidation should be discarded
         if (is_invalidate || mshr->hasPostInvalidate()) {
-            assert(blk != tempBlock);
-            tags->invalidate(blk);
-            blk->invalidate();
+            invalidateBlock(blk);
         } else if (mshr->hasPostDowngrade()) {
             blk->status &= ~BlkWritable;
         }
@@ -1588,6 +1640,13 @@ Cache::allocateBlock(Addr addr, bool is_secure, PacketList &writebacks)
     return blk;
 }
 
+void
+Cache::invalidateBlock(CacheBlk *blk)
+{
+    if (blk != tempBlock)
+        tags->invalidate(blk);
+    blk->invalidate();
+}
 
 // Note that the reason we return a list of writebacks rather than
 // inserting them directly in the write buffer is that this function
@@ -1595,7 +1654,8 @@ Cache::allocateBlock(Addr addr, bool is_secure, PacketList &writebacks)
 // mode we don't mess with the write buffer (we just perform the
 // writebacks atomically once the original request is complete).
 CacheBlk*
-Cache::handleFill(PacketPtr pkt, CacheBlk *blk, PacketList &writebacks)
+Cache::handleFill(PacketPtr pkt, CacheBlk *blk, PacketList &writebacks,
+                  bool allocate)
 {
     assert(pkt->isResponse() || pkt->cmd == MemCmd::WriteLineReq);
     Addr addr = pkt->getAddr();
@@ -1619,11 +1679,14 @@ Cache::handleFill(PacketPtr pkt, CacheBlk *blk, PacketList &writebacks)
         // happens in the subsequent satisfyCpuSideRequest.
         assert(pkt->isRead() || pkt->cmd == MemCmd::WriteLineReq);
 
-        // need to do a replacement
-        blk = allocateBlock(addr, is_secure, writebacks);
+        // need to do a replacement if allocating, otherwise we stick
+        // with the temporary storage
+        blk = allocate ? allocateBlock(addr, is_secure, writebacks) : NULL;
+
         if (blk == NULL) {
-            // No replaceable block... just use temporary storage to
-            // complete the current request and then get rid of it
+            // No replaceable block or a mostly exclusive
+            // cache... just use temporary storage to complete the
+            // current request and then get rid of it
             assert(!tempBlock->isValid());
             blk = tempBlock;
             tempBlock->set = tags->extractSet(addr);
@@ -1877,6 +1940,7 @@ Cache::handleSnoop(PacketPtr pkt, CacheBlk *blk, bool is_timing,
         // applies both to reads and writes and that for writes it
         // works thanks to the fact that we still have dirty data and
         // will write it back at a later point
+        assert(!pkt->memInhibitAsserted());
         pkt->assertMemInhibit();
         if (have_exclusive) {
             // in the case of an uncacheable request there is no point
@@ -1911,9 +1975,7 @@ Cache::handleSnoop(PacketPtr pkt, CacheBlk *blk, bool is_timing,
     // Do this last in case it deallocates block data or something
     // like that
     if (invalidate) {
-        if (blk != tempBlock)
-            tags->invalidate(blk);
-        blk->invalidate();
+        invalidateBlock(blk);
     }
 
     DPRINTF(Cache, "new state is %s\n", blk->print());
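Taken together, the cache.cc changes make a mostly exclusive cache behave much like a victim cache: it only allocates fills that did not originate in a cache above, and it drops a line once an upstream cache hits on it. The toy Python model below (not gem5 code; all names are invented for illustration) makes that externally visible difference concrete:

    MOSTLY_INCL, MOSTLY_EXCL = 'mostly_incl', 'mostly_excl'

    class ToyCache:
        def __init__(self, clusivity):
            self.clusivity = clusivity
            self.blocks = set()          # addresses currently cached

        def fill(self, addr, from_cache):
            # mostly inclusive: allocate on every fill; mostly
            # exclusive: allocate only if the requester is a
            # non-caching source (e.g. a table walker), otherwise the
            # data merely passes through via the temporary block
            if self.clusivity == MOSTLY_INCL or not from_cache:
                self.blocks.add(addr)

        def hit_from_upstream(self, addr):
            # on a hit that supplies data to an upstream cache, a
            # mostly exclusive cache drops its own copy
            if self.clusivity == MOSTLY_EXCL:
                self.blocks.discard(addr)

    l2_incl = ToyCache(MOSTLY_INCL)
    l2_excl = ToyCache(MOSTLY_EXCL)

    for l2 in (l2_incl, l2_excl):
        l2.fill(0x80, from_cache=True)   # fill on behalf of an L1 miss
        l2.fill(0xc0, from_cache=False)  # fill for a table-walker request

    print(sorted(l2_incl.blocks))  # [128, 192]: inclusive allocates both
    print(sorted(l2_excl.blocks))  # [192]: exclusive skips the L1-bound fill

    l2_incl.hit_from_upstream(0x80)
    l2_excl.hit_from_upstream(0xc0)
    print(0x80 in l2_incl.blocks)  # True: inclusive keeps its copy
    print(0xc0 in l2_excl.blocks)  # False: exclusive drops the line
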
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index ae9e7e694..6da837003 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2014 ARM Limited
+ * Copyright (c) 2012-2015 ARM Limited
  * All rights reserved.
  *
  * The license below extends only to copyright in the software and shall
@@ -53,6 +53,7 @@
 #define __MEM_CACHE_CACHE_HH__
 
 #include "base/misc.hh" // fatal, panic, and warn
+#include "enums/Clusivity.hh"
 #include "mem/cache/base.hh"
 #include "mem/cache/blk.hh"
 #include "mem/cache/mshr.hh"
@@ -194,6 +195,13 @@ class Cache : public BaseCache
      */
     const bool prefetchOnAccess;
 
+    /**
+     * Clusivity with respect to the upstream cache, determining if we
+     * fill into both this cache and the cache above on a miss. Note
+     * that we currently do not support strict clusivity policies.
+     */
+    const Enums::Clusivity clusivity;
+
     /**
      * Upstream caches need this packet until true is returned, so
      * hold it for deletion until a subsequent call
@@ -201,7 +209,36 @@ class Cache : public BaseCache
      */
     std::unique_ptr<Packet> pendingDelete;
 
     /**
+     * Writebacks from the tempBlock, resulting on the response path
+     * in atomic mode, must happen after the call to recvAtomic has
+     * finished (for the right ordering of the packets). We therefore
+     * need to hold on to the packets, and have a method and an event
+     * to send them.
+     */
+    PacketPtr tempBlockWriteback;
+
+    /**
+     * Send the outstanding tempBlock writeback. To be called after
+     * recvAtomic finishes in cases where the block we filled is in
+     * fact the tempBlock, and now needs to be written back.
+     */
+    void writebackTempBlockAtomic() {
+        assert(tempBlockWriteback != nullptr);
+        PacketList writebacks{tempBlockWriteback};
+        doWritebacksAtomic(writebacks);
+        tempBlockWriteback = nullptr;
+    }
+
+    /**
+     * An event to writeback the tempBlock after recvAtomic
+     * finishes. To avoid other calls to recvAtomic getting in
+     * between, we create this event with a higher priority.
+     */
+    EventWrapper<Cache, &Cache::writebackTempBlockAtomic> \
+        writebackTempBlockAtomicEvent;
+
+    /**
      * Does all the processing necessary to perform the provided request.
      * @param pkt The memory request to perform.
      * @param blk The cache block to be updated.
@@ -226,17 +263,47 @@ class Cache : public BaseCache
     CacheBlk *allocateBlock(Addr addr, bool is_secure, PacketList &writebacks);
 
     /**
+     * Invalidate a cache block.
+     *
+     * @param blk Block to invalidate
+     */
+    void invalidateBlock(CacheBlk *blk);
+
+    /**
      * Populates a cache block and handles all outstanding requests for the
     * satisfied fill request. This version takes two memory requests. One
     * contains the fill data, the other is an optional target to satisfy.
     * @param pkt The memory request with the fill data.
     * @param blk The cache block if it already exists.
     * @param writebacks List for any writebacks that need to be performed.
+     * @param allocate Whether to allocate a block or use the temp block
     * @return Pointer to the new cache block.
     */
     CacheBlk *handleFill(PacketPtr pkt, CacheBlk *blk,
-                         PacketList &writebacks);
+                         PacketList &writebacks, bool allocate);
 
+    /**
+     * Determine whether we should allocate on a fill or not. If this
+     * cache is mostly inclusive with regard to the upstream cache(s)
+     * we always allocate (for any non-forwarded and cacheable
+     * requests). In the case of a mostly exclusive cache, we allocate
+     * on fill if the packet did not come from a cache, thus if we
+     * are dealing with a whole-line write (which behaves much like a
+     * writeback), the original target packet came from a non-caching
+     * source, or if we are performing a prefetch or LLSC.
+     *
+     * @param cmd Command of the incoming requesting packet
+     * @return Whether we should allocate on the fill
+     */
+    inline bool allocOnFill(MemCmd cmd) const
+    {
+        return clusivity == Enums::mostly_incl ||
+            cmd == MemCmd::WriteLineReq ||
+            cmd == MemCmd::ReadReq ||
+            cmd == MemCmd::WriteReq ||
+            cmd.isPrefetch() ||
+            cmd.isLLSC();
+    }
 
     /**
      * Performs the access specified by the request.
diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc
index f71ff6524..b58c256cd 100644
--- a/src/mem/cache/mshr.cc
+++ b/src/mem/cache/mshr.cc
@@ -66,7 +66,8 @@ MSHR::MSHR() : readyTime(0), _isUncacheable(false), downstreamPending(false),
                postInvalidate(false), postDowngrade(false),
                queue(NULL), order(0), blkAddr(0),
                blkSize(0), isSecure(false), inService(false),
-               isForward(false), threadNum(InvalidThreadID), data(NULL)
+               isForward(false), allocOnFill(false),
+               threadNum(InvalidThreadID), data(NULL)
 {
 }
 
@@ -202,7 +203,7 @@ MSHR::TargetList::print(std::ostream &os, int verbosity,
 
 void
 MSHR::allocate(Addr blk_addr, unsigned blk_size, PacketPtr target,
-               Tick when_ready, Counter _order)
+               Tick when_ready, Counter _order, bool alloc_on_fill)
 {
     blkAddr = blk_addr;
     blkSize = blk_size;
@@ -211,6 +212,7 @@ MSHR::allocate(Addr blk_addr, unsigned blk_size, PacketPtr target,
     order = _order;
     assert(target);
     isForward = false;
+    allocOnFill = alloc_on_fill;
     _isUncacheable = target->req->isUncacheable();
     inService = false;
     downstreamPending = false;
@@ -274,7 +276,8 @@ MSHR::deallocate()
  * Adds a target to an MSHR
  */
 void
-MSHR::allocateTarget(PacketPtr pkt, Tick whenReady, Counter _order)
+MSHR::allocateTarget(PacketPtr pkt, Tick whenReady, Counter _order,
+                     bool alloc_on_fill)
 {
     // assume we'd never issue a prefetch when we've got an
     // outstanding miss
@@ -285,6 +288,10 @@ MSHR::allocateTarget(PacketPtr pkt, Tick whenReady, Counter _order)
     // have targets added if originally allocated uncacheable
     assert(!_isUncacheable);
 
+    // potentially re-evaluate whether we should allocate on a fill or
+    // not
+    allocOnFill = allocOnFill || alloc_on_fill;
+
     // if there's a request already in service for this MSHR, we will
     // have to defer the new target until after the response if any of
     // the following are true:
@@ -478,6 +485,7 @@ MSHR::print(std::ostream &os, int verbosity, const std::string &prefix) const
             prefix, blkAddr, blkAddr + blkSize - 1,
             isSecure ? "s" : "ns",
             isForward ? "Forward" : "",
+            allocOnFill ? "AllocOnFill" : "",
             isForwardNoResponse() ? "ForwNoResp" : "",
             needsExclusive() ? "Excl" : "",
             _isUncacheable ? "Unc" : "",
diff --git a/src/mem/cache/mshr.hh b/src/mem/cache/mshr.hh
index 11ca4db40..45d7628fd 100644
--- a/src/mem/cache/mshr.hh
+++ b/src/mem/cache/mshr.hh
@@ -161,6 +161,9 @@ class MSHR : public Packet::SenderState, public Printable
     /** True if the request is just a simple forward from an upper level */
     bool isForward;
 
+    /** Keep track of whether we should allocate on fill or not */
+    bool allocOnFill;
+
     /** The pending* and post* flags are only valid if inService is
      *  true. Using the accessor functions lets us detect if these
      *  flags are accessed improperly.
@@ -218,9 +221,10 @@ class MSHR : public Packet::SenderState, public Printable
      * @param pkt The original miss.
      * @param when_ready When should the MSHR be ready to act upon.
      * @param _order The logical order of this MSHR
+     * @param alloc_on_fill Should the cache allocate a block on fill
      */
     void allocate(Addr blk_addr, unsigned blk_size, PacketPtr pkt,
-                  Tick when_ready, Counter _order);
+                  Tick when_ready, Counter _order, bool alloc_on_fill);
 
     bool markInService(bool pending_dirty_resp);
@@ -235,7 +239,8 @@ class MSHR : public Packet::SenderState, public Printable
      * Add a request to the list of targets.
      * @param target The target.
      */
-    void allocateTarget(PacketPtr target, Tick when, Counter order);
+    void allocateTarget(PacketPtr target, Tick when, Counter order,
+                        bool alloc_on_fill);
     bool handleSnoop(PacketPtr target, Counter order);
 
     /** A simple constructor. */
diff --git a/src/mem/cache/mshr_queue.cc b/src/mem/cache/mshr_queue.cc
index 47f044d63..3aa5d85be 100644
--- a/src/mem/cache/mshr_queue.cc
+++ b/src/mem/cache/mshr_queue.cc
@@ -146,14 +146,14 @@ MSHRQueue::addToReadyList(MSHR *mshr)
 
 MSHR *
 MSHRQueue::allocate(Addr blk_addr, unsigned blk_size, PacketPtr pkt,
-                    Tick when_ready, Counter order)
+                    Tick when_ready, Counter order, bool alloc_on_fill)
 {
     assert(!freeList.empty());
     MSHR *mshr = freeList.front();
     assert(mshr->getNumTargets() == 0);
     freeList.pop_front();
 
-    mshr->allocate(blk_addr, blk_size, pkt, when_ready, order);
+    mshr->allocate(blk_addr, blk_size, pkt, when_ready, order, alloc_on_fill);
     mshr->allocIter = allocatedList.insert(allocatedList.end(), mshr);
     mshr->readyIter = addToReadyList(mshr);
 
diff --git a/src/mem/cache/mshr_queue.hh b/src/mem/cache/mshr_queue.hh
index eebfed827..29191a358 100644
--- a/src/mem/cache/mshr_queue.hh
+++ b/src/mem/cache/mshr_queue.hh
@@ -152,13 +152,14 @@ class MSHRQueue : public Drainable
      * @param pkt The original miss.
      * @param when_ready When should the MSHR be ready to act upon.
      * @param order The logical order of this MSHR
+     * @param alloc_on_fill Should the cache allocate a block on fill
      *
      * @return A pointer to the allocated MSHR.
      *
      * @pre There are free entries.
      */
     MSHR *allocate(Addr blk_addr, unsigned blk_size, PacketPtr pkt,
-                   Tick when_ready, Counter order);
+                   Tick when_ready, Counter order, bool alloc_on_fill);
 
     /**
      * Removes the given MSHR from the queue. This places the MSHR on the