8 files changed, 207 insertions, 126 deletions
diff --git a/configs/example/memtest.py b/configs/example/memtest.py
index a3298890b..97bf79dff 100644
--- a/configs/example/memtest.py
+++ b/configs/example/memtest.py
@@ -299,6 +299,7 @@ make_cache_level(cachespec, cache_proto, len(cachespec), None)
 # Connect the lowest level crossbar to the memory
 last_subsys = getattr(system, 'l%dsubsys0' % len(cachespec))
 last_subsys.xbar.master = system.physmem.port
+last_subsys.xbar.point_of_coherency = True
 
 root = Root(full_system = False, system = system)
 if options.atomic:
diff --git a/src/mem/XBar.py b/src/mem/XBar.py
index 8614519b3..674f9262e 100644
--- a/src/mem/XBar.py
+++ b/src/mem/XBar.py
@@ -100,6 +100,12 @@ class CoherentXBar(BaseXBar):
     # An optional snoop filter
     snoop_filter = Param.SnoopFilter(NULL, "Selected snoop filter")
 
+    # Determine how this crossbar handles packets where caches have
+    # already committed to responding, by establishing if the crossbar
+    # is the point of coherency or not.
+    point_of_coherency = Param.Bool(False, "Consider this crossbar the " \
+                                    "point of coherency")
+
     system = Param.System(Parent.any, "System that the crossbar belongs to.")
 
 class SnoopFilter(SimObject):
@@ -147,6 +153,11 @@ class SystemXBar(CoherentXBar):
     response_latency = 2
     snoop_response_latency = 4
 
+    # This specialisation of the coherent crossbar is to be considered
+    # the point of coherency, as there are no (coherent) downstream
+    # caches.
+    point_of_coherency = True
+
 # In addition to the system interconnect, we typically also have one
 # or more on-chip I/O crossbars. Note that at some point we might want
 # to also define an off-chip I/O crossbar such as PCIe.
diff --git a/src/mem/bridge.cc b/src/mem/bridge.cc
index 226647fdc..8a209e8b7 100644
--- a/src/mem/bridge.cc
+++ b/src/mem/bridge.cc
@@ -154,14 +154,8 @@ Bridge::BridgeSlavePort::recvTimingReq(PacketPtr pkt)
     DPRINTF(Bridge, "recvTimingReq: %s addr 0x%x\n",
             pkt->cmdString(), pkt->getAddr());
 
-    // if a cache is responding, sink the packet without further
-    // action, also discard any packet that is not a read or a write
-    if (pkt->cacheResponding() ||
-        !(pkt->isWrite() || pkt->isRead())) {
-        assert(!pkt->needsResponse());
-        pendingDelete.reset(pkt);
-        return true;
-    }
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
 
     // we should not get a new request after committing to retry the
     // current one, but unfortunately the CPU violates this rule, so
@@ -352,6 +346,9 @@ Bridge::BridgeSlavePort::recvRespRetry()
 Tick
 Bridge::BridgeSlavePort::recvAtomic(PacketPtr pkt)
 {
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
+
     return delay * bridge.clockPeriod() + masterPort.sendAtomic(pkt);
 }
 
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index 189fd4cab..e9b909646 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -630,57 +630,55 @@ Cache::recvTimingReq(PacketPtr pkt)
         // flag) is not providing writable (it is in Owned rather than
         // the Modified state), we know that there may be other Shared
         // copies in the system; go out and invalidate them all
-        if (pkt->needsWritable() && !pkt->responderHadWritable()) {
-            // an upstream cache that had the line in Owned state
-            // (dirty, but not writable), is responding and thus
-            // transferring the dirty line from one branch of the
-            // cache hierarchy to another
-
-            // send out an express snoop and invalidate all other
-            // copies (snooping a packet that needs writable is the
-            // same as an invalidation), thus turning the Owned line
-            // into a Modified line, note that we don't invalidate the
-            // block in the current cache or any other cache on the
-            // path to memory
-
-            // create a downstream express snoop with cleared packet
-            // flags, there is no need to allocate any data as the
-            // packet is merely used to co-ordinate state transitions
-            Packet *snoop_pkt = new Packet(pkt, true, false);
-
-            // also reset the bus time that the original packet has
-            // not yet paid for
-            snoop_pkt->headerDelay = snoop_pkt->payloadDelay = 0;
-
-            // make this an instantaneous express snoop, and let the
-            // other caches in the system know that the another cache
-            // is responding, because we have found the authorative
-            // copy (Modified or Owned) that will supply the right
-            // data
-            snoop_pkt->setExpressSnoop();
-            snoop_pkt->setCacheResponding();
-
-            // this express snoop travels towards the memory, and at
-            // every crossbar it is snooped upwards thus reaching
-            // every cache in the system
-            bool M5_VAR_USED success = memSidePort->sendTimingReq(snoop_pkt);
-            // express snoops always succeed
-            assert(success);
-
-            // main memory will delete the snoop packet
-        }
+        assert(pkt->needsWritable() && !pkt->responderHadWritable());
+
+        // an upstream cache that had the line in Owned state
+        // (dirty, but not writable), is responding and thus
+        // transferring the dirty line from one branch of the
+        // cache hierarchy to another
+
+        // send out an express snoop and invalidate all other
+        // copies (snooping a packet that needs writable is the
+        // same as an invalidation), thus turning the Owned line
+        // into a Modified line, note that we don't invalidate the
+        // block in the current cache or any other cache on the
+        // path to memory
+
+        // create a downstream express snoop with cleared packet
+        // flags, there is no need to allocate any data as the
+        // packet is merely used to co-ordinate state transitions
+        Packet *snoop_pkt = new Packet(pkt, true, false);
+
+        // also reset the bus time that the original packet has
+        // not yet paid for
+        snoop_pkt->headerDelay = snoop_pkt->payloadDelay = 0;
+
+        // make this an instantaneous express snoop, and let the
+        // other caches in the system know that the another cache
+        // is responding, because we have found the authorative
+        // copy (Modified or Owned) that will supply the right
+        // data
+        snoop_pkt->setExpressSnoop();
+        snoop_pkt->setCacheResponding();
+
+        // this express snoop travels towards the memory, and at
+        // every crossbar it is snooped upwards thus reaching
+        // every cache in the system
+        bool M5_VAR_USED success = memSidePort->sendTimingReq(snoop_pkt);
+        // express snoops always succeed
+        assert(success);
+
+        // main memory will delete the snoop packet
 
         // queue for deletion, as opposed to immediate deletion, as
         // the sending cache is still relying on the packet
         pendingDelete.reset(pkt);
 
-        // no need to take any action in this particular cache as an
-        // upstream cache has already committed to responding, and
-        // either the packet does not need writable (and we can let
-        // the cache that set the cache responding flag pass on the
-        // line without any need for intervention), or if the packet
-        // needs writable it is provided, or we have already sent out
-        // any express snoops in the section above
+        // no need to take any further action in this particular cache
+        // as an upstram cache has already committed to responding,
+        // and we have already sent out any express snoops in the
+        // section above to ensure all other copies in the system are
+        // invalidated
         return true;
     }
 
@@ -1028,9 +1026,8 @@ Cache::recvAtomic(PacketPtr pkt)
         // if a cache is responding, and it had the line in Owned
         // rather than Modified state, we need to invalidate any
         // copies that are not on the same path to memory
-        if (pkt->needsWritable() && !pkt->responderHadWritable()) {
-            lat += ticksToCycles(memSidePort->sendAtomic(pkt));
-        }
+        assert(pkt->needsWritable() && !pkt->responderHadWritable());
+        lat += ticksToCycles(memSidePort->sendAtomic(pkt));
 
         return lat * clockPeriod();
     }
@@ -2493,11 +2490,8 @@ Cache::CpuSidePort::recvTimingReq(PacketPtr pkt)
 
     bool success = false;
 
-    // always let packets through if an upstream cache has committed
-    // to responding, even if blocked (we should technically look at
-    // the isExpressSnoop flag, but it is set by the cache itself, and
-    // consequently we have to rely on the cacheResponding flag)
-    if (pkt->cacheResponding()) {
+    // always let express snoop packets through if even if blocked
+    if (pkt->isExpressSnoop()) {
         // do not change the current retry state
         bool M5_VAR_USED bypass_success = cache->recvTimingReq(pkt);
         assert(bypass_success);
diff --git a/src/mem/coherent_xbar.cc b/src/mem/coherent_xbar.cc
index 3731bea3f..22642e6f2 100644
--- a/src/mem/coherent_xbar.cc
+++ b/src/mem/coherent_xbar.cc
@@ -56,7 +56,8 @@
 
 CoherentXBar::CoherentXBar(const CoherentXBarParams *p)
     : BaseXBar(p), system(p->system), snoopFilter(p->snoop_filter),
-      snoopResponseLatency(p->snoop_response_latency)
+      snoopResponseLatency(p->snoop_response_latency),
+      pointOfCoherency(p->point_of_coherency)
 {
     // create the ports based on the size of the master and slave
     // vector ports, and the presence of the default port, the ports
@@ -219,32 +220,48 @@ CoherentXBar::recvTimingReq(PacketPtr pkt, PortID slave_port_id)
         pkt->snoopDelay = 0;
     }
 
-    // forwardTiming snooped into peer caches of the sender, and if
-    // this is a clean evict or clean writeback, but the packet is
-    // found in a cache, do not forward it
-    if ((pkt->cmd == MemCmd::CleanEvict ||
-         pkt->cmd == MemCmd::WritebackClean) && pkt->isBlockCached()) {
-        DPRINTF(CoherentXBar, "Clean evict/writeback %#llx still cached, "
-                "not forwarding\n", pkt->getAddr());
-
-        // update the layer state and schedule an idle event
-        reqLayers[master_port_id]->succeededTiming(packetFinishTime);
-
-        // queue the packet for deletion
-        pendingDelete.reset(pkt);
-
-        return true;
-    }
+    // set up a sensible starting point
+    bool success = true;
 
     // remember if the packet will generate a snoop response by
     // checking if a cache set the cacheResponding flag during the
     // snooping above
     const bool expect_snoop_resp = !cache_responding && pkt->cacheResponding();
-    const bool expect_response = pkt->needsResponse() &&
-        !pkt->cacheResponding();
+    bool expect_response = pkt->needsResponse() && !pkt->cacheResponding();
+
+    const bool sink_packet = sinkPacket(pkt);
+
+    // in certain cases the crossbar is responsible for responding
+    bool respond_directly = false;
+
+    if (sink_packet) {
+        DPRINTF(CoherentXBar, "Not forwarding %s to %#llx\n",
+                pkt->cmdString(), pkt->getAddr());
+    } else {
+        // determine if we are forwarding the packet, or responding to
+        // it
+        if (!pointOfCoherency || pkt->isRead() || pkt->isWrite()) {
+            // if we are passing on, rather than sinking, a packet to
+            // which an upstream cache has committed to responding,
+            // the line was needs writable, and the responding only
+            // had an Owned copy, so we need to immidiately let the
+            // downstream caches know, bypass any flow control
+            if (pkt->cacheResponding()) {
+                pkt->setExpressSnoop();
+            }
 
-    // since it is a normal request, attempt to send the packet
-    bool success = masterPorts[master_port_id]->sendTimingReq(pkt);
+            // since it is a normal request, attempt to send the packet
+            success = masterPorts[master_port_id]->sendTimingReq(pkt);
+        } else {
+            // no need to forward, turn this packet around and respond
+            // directly
+            assert(pkt->needsResponse());
+
+            respond_directly = true;
+            assert(!expect_snoop_resp);
+            expect_response = false;
+        }
+    }
 
     if (snoopFilter && !system->bypassCaches()) {
         // Let the snoop filter know about the success of the send operation
@@ -303,6 +320,27 @@ CoherentXBar::recvTimingReq(PacketPtr pkt, PortID slave_port_id)
             snoops++;
     }
 
+    if (sink_packet)
+        // queue the packet for deletion
+        pendingDelete.reset(pkt);
+
+    if (respond_directly) {
+        assert(pkt->needsResponse());
+        assert(success);
+
+        pkt->makeResponse();
+
+        if (snoopFilter && !system->bypassCaches()) {
+            // let the snoop filter inspect the response and update its state
+            snoopFilter->updateResponse(pkt, *slavePorts[slave_port_id]);
+        }
+
+        Tick response_time = clockEdge() + pkt->headerDelay;
+        pkt->headerDelay = 0;
+
+        slavePorts[slave_port_id]->schedTimingResp(pkt, response_time);
+    }
+
     return success;
 }
 
@@ -633,27 +671,35 @@ CoherentXBar::recvAtomic(PacketPtr pkt, PortID slave_port_id)
         snoop_response_latency += snoop_result.second;
     }
 
-    // forwardAtomic snooped into peer caches of the sender, and if
-    // this is a clean evict, but the packet is found in a cache, do
-    // not forward it
-    if ((pkt->cmd == MemCmd::CleanEvict ||
-         pkt->cmd == MemCmd::WritebackClean) && pkt->isBlockCached()) {
-        DPRINTF(CoherentXBar, "Clean evict/writeback %#llx still cached, "
-                "not forwarding\n", pkt->getAddr());
-        return 0;
-    }
+    // set up a sensible default value
+    Tick response_latency = 0;
+
+    const bool sink_packet = sinkPacket(pkt);
 
     // even if we had a snoop response, we must continue and also
     // perform the actual request at the destination
     PortID master_port_id = findPort(pkt->getAddr());
 
+    if (sink_packet) {
+        DPRINTF(CoherentXBar, "Not forwarding %s to %#llx\n",
+                pkt->cmdString(), pkt->getAddr());
+    } else {
+        if (!pointOfCoherency || pkt->isRead() || pkt->isWrite()) {
+            // forward the request to the appropriate destination
+            response_latency = masterPorts[master_port_id]->sendAtomic(pkt);
+        } else {
+            // if it does not need a response we sink the packet above
+            assert(pkt->needsResponse());
+
+            pkt->makeResponse();
+        }
+    }
+
     // stats updates for the request
     pktCount[slave_port_id][master_port_id]++;
     pktSize[slave_port_id][master_port_id] += pkt_size;
     transDist[pkt_cmd]++;
 
-    // forward the request to the appropriate destination
-    Tick response_latency = masterPorts[master_port_id]->sendAtomic(pkt);
 
     // if lower levels have replied, tell the snoop filter
     if (!system->bypassCaches() && snoopFilter && pkt->isResponse()) {
@@ -877,6 +923,30 @@ CoherentXBar::forwardFunctional(PacketPtr pkt, PortID exclude_slave_port_id)
     }
 }
 
+bool
+CoherentXBar::sinkPacket(const PacketPtr pkt) const
+{
+    // we can sink the packet if:
+    // 1) the crossbar is the point of coherency, and a cache is
+    //    responding after being snooped
+    // 2) the crossbar is the point of coherency, and the packet is a
+    //    coherency packet (not a read or a write) that does not
+    //    require a response
+    // 3) this is a clean evict or clean writeback, but the packet is
+    //    found in a cache above this crossbar
+    // 4) a cache is responding after being snooped, and the packet
+    //    either does not need the block to be writable, or the cache
+    //    that has promised to respond (setting the cache responding
+    //    flag) is providing writable and thus had a Modified block,
+    //    and no further action is needed
+    return (pointOfCoherency && pkt->cacheResponding()) ||
+        (pointOfCoherency && !(pkt->isRead() || pkt->isWrite()) &&
+         !pkt->needsResponse()) ||
+        (pkt->isCleanEviction() && pkt->isBlockCached()) ||
+        (pkt->cacheResponding() &&
+         (!pkt->needsWritable() || pkt->responderHadWritable()));
+}
+
 void
 CoherentXBar::regStats()
 {
diff --git a/src/mem/coherent_xbar.hh b/src/mem/coherent_xbar.hh
index 24d00628b..8c55b59da 100644
--- a/src/mem/coherent_xbar.hh
+++ b/src/mem/coherent_xbar.hh
@@ -273,6 +273,9 @@ class CoherentXBar : public BaseXBar
     /** Cycles of snoop response latency.*/
     const Cycles snoopResponseLatency;
 
+    /** Is this crossbar the point of coherency? **/
+    const bool pointOfCoherency;
+
     /**
      * Upstream caches need this packet until true is returned, so
      * hold it for deletion until a subsequent call
@@ -384,6 +387,12 @@ class CoherentXBar : public BaseXBar
      */
     void forwardFunctional(PacketPtr pkt, PortID exclude_slave_port_id);
 
+    /**
+     * Determine if the crossbar should sink the packet, as opposed to
+     * forwarding it, or responding.
+     */
+    bool sinkPacket(const PacketPtr pkt) const;
+
     Stats::Scalar snoops;
     Stats::Distribution snoopFanout;
 
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index c7ad3b448..55e1cf805 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -273,11 +273,14 @@ DRAMCtrl::recvAtomic(PacketPtr pkt)
 {
     DPRINTF(DRAM, "recvAtomic: %s 0x%x\n", pkt->cmdString(), pkt->getAddr());
 
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
+
     // do the actual memory access and turn the packet into a response
     access(pkt);
 
     Tick latency = 0;
-    if (!pkt->cacheResponding() && pkt->hasData()) {
+    if (pkt->hasData()) {
         // this value is not supposed to be accurate, just enough to
         // keep things going, mimic a closed page
         latency = tRP + tRCD + tCL;
@@ -590,11 +593,11 @@ DRAMCtrl::recvTimingReq(PacketPtr pkt)
     DPRINTF(DRAM, "recvTimingReq: request %s addr %lld size %d\n",
             pkt->cmdString(), pkt->getAddr(), pkt->getSize());
 
-    // if a cache is responding, sink the packet without further action
-    if (pkt->cacheResponding()) {
-        pendingDelete.reset(pkt);
-        return true;
-    }
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
+
+    panic_if(!(pkt->isRead() || pkt->isWrite()),
+             "Should only see read and writes at memory controller\n");
 
     // Calc avg gap between requests
     if (prevArrival != 0) {
@@ -625,7 +628,8 @@ DRAMCtrl::recvTimingReq(PacketPtr pkt)
             readReqs++;
             bytesReadSys += size;
         }
-    } else if (pkt->isWrite()) {
+    } else {
+        assert(pkt->isWrite());
         assert(size != 0);
         if (writeQueueFull(dram_pkt_count)) {
             DPRINTF(DRAM, "Write queue full, not accepting\n");
@@ -638,10 +642,6 @@ DRAMCtrl::recvTimingReq(PacketPtr pkt)
             writeReqs++;
             bytesWrittenSys += size;
         }
-    } else {
-        DPRINTF(DRAM,"Neither read nor write, ignore timing\n");
-        neitherReadNorWrite++;
-        accessAndRespond(pkt, 1);
     }
 
     return true;
diff --git a/src/mem/simple_mem.cc b/src/mem/simple_mem.cc
index 8760c2bef..bb44b8c85 100644
--- a/src/mem/simple_mem.cc
+++ b/src/mem/simple_mem.cc
@@ -72,8 +72,11 @@ SimpleMemory::init()
 Tick
 SimpleMemory::recvAtomic(PacketPtr pkt)
 {
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
+
     access(pkt);
-    return pkt->cacheResponding() ? 0 : getLatency();
+    return getLatency();
 }
 
 void
@@ -97,11 +100,12 @@ SimpleMemory::recvFunctional(PacketPtr pkt)
 bool
 SimpleMemory::recvTimingReq(PacketPtr pkt)
 {
-    // if a cache is responding, sink the packet without further action
-    if (pkt->cacheResponding()) {
-        pendingDelete.reset(pkt);
-        return true;
-    }
+    panic_if(pkt->cacheResponding(), "Should not see packets where cache "
+             "is responding");
+
+    panic_if(!(pkt->isRead() || pkt->isWrite()),
+             "Should only see read and writes at memory controller, "
+             "saw %s to %#llx\n", pkt->cmdString(), pkt->getAddr());
 
     // we should not get a new request after committing to retry the
     // current one, but unfortunately the CPU violates this rule, so
@@ -127,21 +131,16 @@ SimpleMemory::recvTimingReq(PacketPtr pkt)
     // rather than long term as it is the short term data rate that is
     // limited for any real memory
 
-    // only look at reads and writes when determining if we are busy,
-    // and for how long, as it is not clear what to regulate for the
-    // other types of commands
-    if (pkt->isRead() || pkt->isWrite()) {
-        // calculate an appropriate tick to release to not exceed
-        // the bandwidth limit
-        Tick duration = pkt->getSize() * bandwidth;
-
-        // only consider ourselves busy if there is any need to wait
-        // to avoid extra events being scheduled for (infinitely) fast
-        // memories
-        if (duration != 0) {
-            schedule(releaseEvent, curTick() + duration);
-            isBusy = true;
-        }
+    // calculate an appropriate tick to release to not exceed
+    // the bandwidth limit
+    Tick duration = pkt->getSize() * bandwidth;
+
+    // only consider ourselves busy if there is any need to wait
+    // to avoid extra events being scheduled for (infinitely) fast
+    // memories
+    if (duration != 0) {
+        schedule(releaseEvent, curTick() + duration);
+        isBusy = true;
     }
 
     // go ahead and deal with the packet and put the response in the