diff options
author | Nikos Nikoleris <nikos.nikoleris@arm.com> | 2016-09-22 12:02:29 +0100 |
---|---|---|
committer | Nikos Nikoleris <nikos.nikoleris@arm.com> | 2017-12-05 11:47:01 +0000 |
commit | 2f4fb22f242b897568d5cbf0e6bc6a77f036f44a (patch) | |
tree | 9396a0270f7dde43d07453728aea2ea1bd27ad9b | |
parent | 9a49827a44853679b737b8ee4255ce22d5fbfa6c (diff) | |
download | gem5-2f4fb22f242b897568d5cbf0e6bc6a77f036f44a.tar.xz |
mem: Co-ordination of CMOs in the xbar
A clean packet request serving a cache maintenance operation (CMO)
visits all memories down to the specified xbar. The visited caches
invalidate their copy (if the CMO is invalidating) and if a dirty copy
is found a write packet writes the dirty data to the memory level
below the specified xbar. A response is sent back when all the caches
are clean and/or invalidated and the specified xbar has seen the write
packet.
This patch adds the following functionality in the xbar:
1) Accounts for the cache clean requests that go through the xbar
2) Generates the cache clean response when both the cache clean
request and the corresponding writeclean packet have crossed the
destination xbar.
Previously transactions in the xbar were identified using the pointer
of the original request. Cache clean transactions consist of two
different packets, the clean request and the writeclean, and therefore
have different request pointers. This patch adds support for custom
transaction IDs that by default take the value of the request pointer
but can be overridden by the constructor. This allows the clean request
and the writeclean to share the same id, which the coherent xbar uses to
co-ordinate them and send the response in a timely manner.
Change-Id: I80db76386a1caded38dc66e6e18f930c3bb800ff
Reviewed-by: Stephan Diestelhorst <stephan.diestelhorst@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/5051
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
-rw-r--r-- | src/mem/cache/cache.cc | 11 | ||||
-rw-r--r-- | src/mem/cache/cache.hh | 2 | ||||
-rw-r--r-- | src/mem/cache/mshr.cc | 2 | ||||
-rw-r--r-- | src/mem/coherent_xbar.cc | 115 | ||||
-rw-r--r-- | src/mem/coherent_xbar.hh | 8 | ||||
-rw-r--r-- | src/mem/packet.hh | 19 |
6 files changed, 135 insertions, 22 deletions
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc index b0e904d39..a83f8ab12 100644 --- a/src/mem/cache/cache.cc +++ b/src/mem/cache/cache.cc @@ -1092,7 +1092,7 @@ Cache::recvAtomic(PacketPtr pkt) // until the point of reference. DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n", __func__, pkt->print(), blk->print()); - PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest()); + PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id); writebacks.push_back(wb_pkt); pkt->setSatisfied(); } @@ -1679,7 +1679,7 @@ Cache::writebackBlk(CacheBlk *blk) } PacketPtr -Cache::writecleanBlk(CacheBlk *blk, Request::Flags dest) +Cache::writecleanBlk(CacheBlk *blk, Request::Flags dest, PacketId id) { Request *req = new Request(tags->regenerateBlkAddr(blk->tag, blk->set), blkSize, 0, Request::wbMasterId); @@ -1688,7 +1688,7 @@ Cache::writecleanBlk(CacheBlk *blk, Request::Flags dest) } req->taskId(blk->task_id); blk->task_id = ContextSwitchTaskId::Unknown; - PacketPtr pkt = new Packet(req, MemCmd::WriteClean); + PacketPtr pkt = new Packet(req, MemCmd::WriteClean, blkSize, id); DPRINTF(Cache, "Create %s writable: %d, dirty: %d\n", pkt->print(), blk->isWritable(), blk->isDirty()); // make sure the block is not marked dirty @@ -2093,7 +2093,7 @@ Cache::handleSnoop(PacketPtr pkt, CacheBlk *blk, bool is_timing, if (blk_valid && blk->isDirty()) { DPRINTF(CacheVerbose, "%s: packet (snoop) %s found block: %s\n", __func__, pkt->print(), blk->print()); - PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest()); + PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id); PacketList writebacks; writebacks.push_back(wb_pkt); @@ -2643,7 +2643,8 @@ Cache::sendMSHRQueuePacket(MSHR* mshr) // until the point of reference. 
DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n", __func__, pkt->print(), blk->print()); - PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest()); + PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), + pkt->id); PacketList writebacks; writebacks.push_back(wb_pkt); doWritebacks(writebacks, 0); diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh index cd3a1d8e5..4d840be27 100644 --- a/src/mem/cache/cache.hh +++ b/src/mem/cache/cache.hh @@ -458,7 +458,7 @@ class Cache : public BaseCache * @param dest The destination of this clean operation * @return The write clean packet for the block. */ - PacketPtr writecleanBlk(CacheBlk *blk, Request::Flags dest = 0); + PacketPtr writecleanBlk(CacheBlk *blk, Request::Flags dest, PacketId id); /** * Create a CleanEvict request for the given block. diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc index d89adef19..f1a9b985e 100644 --- a/src/mem/cache/mshr.cc +++ b/src/mem/cache/mshr.cc @@ -426,7 +426,7 @@ MSHR::handleSnoop(PacketPtr pkt, Counter _order) // the packet and the request as part of handling the deferred // snoop. PacketPtr cp_pkt = will_respond ? 
new Packet(pkt, true, true) : - new Packet(new Request(*pkt->req), pkt->cmd, blkSize); + new Packet(new Request(*pkt->req), pkt->cmd, blkSize, pkt->id); if (will_respond) { // we are the ordering point, and will consequently diff --git a/src/mem/coherent_xbar.cc b/src/mem/coherent_xbar.cc index e946134d3..02b3122d9 100644 --- a/src/mem/coherent_xbar.cc +++ b/src/mem/coherent_xbar.cc @@ -40,6 +40,7 @@ * Authors: Ali Saidi * Andreas Hansson * William Wang + * Nikos Nikoleris */ /** @@ -194,6 +195,22 @@ CoherentXBar::recvTimingReq(PacketPtr pkt, PortID slave_port_id) if (snoop_caches) { assert(pkt->snoopDelay == 0); + if (pkt->isClean() && !is_destination) { + // before snooping we need to make sure that the memory + // below is not busy and the cache clean request can be + // forwarded to it + if (!masterPorts[master_port_id]->tryTiming(pkt)) { + DPRINTF(CoherentXBar, "%s: src %s packet %s RETRY\n", __func__, + src_port->name(), pkt->print()); + + // update the layer state and schedule an idle event + reqLayers[master_port_id]->failedTiming(src_port, + clockEdge(Cycles(1))); + return false; + } + } + + // the packet is a memory-mapped request and should be // broadcasted to our snoopers but the source if (snoopFilter) { @@ -342,21 +359,76 @@ CoherentXBar::recvTimingReq(PacketPtr pkt, PortID slave_port_id) // queue the packet for deletion pendingDelete.reset(pkt); + // normally we respond to the packet we just received if we need to + PacketPtr rsp_pkt = pkt; + PortID rsp_port_id = slave_port_id; + + // If this is the destination of the cache clean operation the + // crossbar is responsible for responding. This crossbar will + // respond when the cache clean is complete. 
A cache clean + // is complete either: + // * direcly, if no cache above had a dirty copy of the block + // as indicated by the satisfied flag of the packet, or + // * when the crossbar has seen both the cache clean request + // (CleanSharedReq, CleanInvalidReq) and the corresponding + // write (WriteClean) which updates the block in the memory + // below. + if (success && + ((pkt->isClean() && pkt->satisfied()) || + pkt->cmd == MemCmd::WriteClean) && + is_destination) { + PacketPtr deferred_rsp = pkt->isWrite() ? nullptr : pkt; + auto cmo_lookup = outstandingCMO.find(pkt->id); + if (cmo_lookup != outstandingCMO.end()) { + // the cache clean request has already reached this xbar + respond_directly = true; + if (pkt->isWrite()) { + rsp_pkt = cmo_lookup->second; + assert(rsp_pkt); + + // determine the destination + const auto route_lookup = routeTo.find(rsp_pkt->req); + assert(route_lookup != routeTo.end()); + rsp_port_id = route_lookup->second; + assert(rsp_port_id != InvalidPortID); + assert(rsp_port_id < respLayers.size()); + // remove the request from the routing table + routeTo.erase(route_lookup); + } + outstandingCMO.erase(cmo_lookup); + } else { + respond_directly = false; + outstandingCMO.emplace(pkt->id, deferred_rsp); + if (!pkt->isWrite()) { + assert(routeTo.find(pkt->req) == routeTo.end()); + routeTo[pkt->req] = slave_port_id; + + panic_if(routeTo.size() > 512, + "Routing table exceeds 512 packets\n"); + } + } + } + + if (respond_directly) { - assert(pkt->needsResponse()); + assert(rsp_pkt->needsResponse()); assert(success); - pkt->makeResponse(); + rsp_pkt->makeResponse(); if (snoopFilter && !system->bypassCaches()) { // let the snoop filter inspect the response and update its state - snoopFilter->updateResponse(pkt, *slavePorts[slave_port_id]); + snoopFilter->updateResponse(rsp_pkt, *slavePorts[rsp_port_id]); } + // we send the response after the current packet, even if the + // response is not for this packet (e.g. 
cache clean operation + // where both the request and the write packet have to cross + // the destination xbar before the response is sent.) Tick response_time = clockEdge() + pkt->headerDelay; - pkt->headerDelay = 0; + rsp_pkt->headerDelay = 0; - slavePorts[slave_port_id]->schedTimingResp(pkt, response_time); + slavePorts[rsp_port_id]->schedTimingResp(rsp_pkt, response_time); } return success; @@ -754,6 +826,30 @@ CoherentXBar::recvAtomic(PacketPtr pkt, PortID slave_port_id) response_latency = snoop_response_latency; } + // If this is the destination of the cache clean operation the + // crossbar is responsible for responding. This crossbar will + // respond when the cache clean is complete. An atomic cache clean + // is complete when the crossbars receives the cache clean + // request (CleanSharedReq, CleanInvalidReq), as either: + // * no cache above had a dirty copy of the block as indicated by + // the satisfied flag of the packet, or + // * the crossbar has already seen the corresponding write + // (WriteClean) which updates the block in the memory below. + if (pkt->isClean() && isDestination(pkt) && pkt->satisfied()) { + auto it = outstandingCMO.find(pkt->id); + assert(it != outstandingCMO.end()); + // we are responding right away + outstandingCMO.erase(it); + } else if (pkt->cmd == MemCmd::WriteClean && isDestination(pkt)) { + // if this is the destination of the operation, the xbar + // sends the responce to the cache clean operation only + // after having encountered the cache clean request + auto M5_VAR_USED ret = outstandingCMO.emplace(pkt->id, nullptr); + // in atomic mode we know that the WriteClean packet should + // precede the clean request + assert(ret.second); + } + // add the response data if (pkt->isResponse()) { pkt_size = pkt->hasData() ? 
pkt->getSize() : 0; @@ -988,8 +1084,13 @@ bool CoherentXBar::forwardPacket(const PacketPtr pkt) { // we are forwarding the packet if: - // 1) this is a read or a write - // 2) this crossbar is above the point of coherency + // 1) this is a cache clean request to the PoU/PoC and this + // crossbar is above the PoU/PoC + // 2) this is a read or a write + // 3) this crossbar is above the point of coherency + if (pkt->isClean()) { + return !isDestination(pkt); + } return pkt->isRead() || pkt->isWrite() || !pointOfCoherency; } diff --git a/src/mem/coherent_xbar.hh b/src/mem/coherent_xbar.hh index 0c2907fa0..79777b998 100644 --- a/src/mem/coherent_xbar.hh +++ b/src/mem/coherent_xbar.hh @@ -51,6 +51,7 @@ #ifndef __MEM_COHERENT_XBAR_HH__ #define __MEM_COHERENT_XBAR_HH__ +#include <unordered_map> #include <unordered_set> #include "mem/snoop_filter.hh" @@ -263,6 +264,13 @@ class CoherentXBar : public BaseXBar std::unordered_set<RequestPtr> outstandingSnoop; /** + * Store the outstanding cache maintenance that we are expecting + * snoop responses from so we can determine when we received all + * snoop responses and if any of the agents satisfied the request. + */ + std::unordered_map<PacketId, PacketPtr> outstandingCMO; + + /** * Keep a pointer to the system to be allow to querying memory system * properties. */ diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 88829d358..66625b382 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -70,6 +70,7 @@ class Packet; typedef Packet *PacketPtr; typedef uint8_t* PacketDataPtr; typedef std::list<PacketPtr> PacketList; +typedef uint64_t PacketId; class MemCmd { @@ -316,6 +317,8 @@ class Packet : public Printable /// The command field of the packet. MemCmd cmd; + const PacketId id; + /// A pointer to the original request. const RequestPtr req; @@ -743,9 +746,9 @@ class Packet : public Printable * not be valid. The command must be supplied. 
*/ Packet(const RequestPtr _req, MemCmd _cmd) - : cmd(_cmd), req(_req), data(nullptr), addr(0), _isSecure(false), - size(0), headerDelay(0), snoopDelay(0), payloadDelay(0), - senderState(NULL) + : cmd(_cmd), id((PacketId)_req), req(_req), data(nullptr), addr(0), + _isSecure(false), size(0), headerDelay(0), snoopDelay(0), + payloadDelay(0), senderState(NULL) { if (req->hasPaddr()) { addr = req->getPaddr(); @@ -763,10 +766,10 @@ class Packet : public Printable * a request that is for a whole block, not the address from the * req. this allows for overriding the size/addr of the req. */ - Packet(const RequestPtr _req, MemCmd _cmd, int _blkSize) - : cmd(_cmd), req(_req), data(nullptr), addr(0), _isSecure(false), - headerDelay(0), snoopDelay(0), payloadDelay(0), - senderState(NULL) + Packet(const RequestPtr _req, MemCmd _cmd, int _blkSize, PacketId _id = 0) + : cmd(_cmd), id(_id ? _id : (PacketId)_req), req(_req), data(nullptr), + addr(0), _isSecure(false), headerDelay(0), snoopDelay(0), + payloadDelay(0), senderState(NULL) { if (req->hasPaddr()) { addr = req->getPaddr() & ~(_blkSize - 1); @@ -785,7 +788,7 @@ class Packet : public Printable * packet should allocate its own data. */ Packet(const PacketPtr pkt, bool clear_flags, bool alloc_data) - : cmd(pkt->cmd), req(pkt->req), + : cmd(pkt->cmd), id(pkt->id), req(pkt->req), data(nullptr), addr(pkt->addr), _isSecure(pkt->_isSecure), size(pkt->size), bytesValid(pkt->bytesValid), |