Diffstat (limited to 'src/mem/cache/cache_impl.hh')
-rw-r--r-- | src/mem/cache/cache_impl.hh | 2083
1 file changed, 1101 insertions, 982 deletions
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh index 4cd5ab004..d144266ed 100644 --- a/src/mem/cache/cache_impl.hh +++ b/src/mem/cache/cache_impl.hh @@ -28,6 +28,8 @@ * Authors: Erik Hallnor * Dave Greene * Nathan Binkert + * Steve Reinhardt + * Ron Dreslinski */ /** @@ -35,17 +37,8 @@ * Cache definitions. */ -#include <assert.h> -#include <math.h> - -#include <cassert> -#include <iostream> -#include <cstring> -#include <string> - #include "sim/host.hh" #include "base/misc.hh" -#include "cpu/smt.hh" #include "mem/cache/cache.hh" #include "mem/cache/cache_blk.hh" @@ -54,557 +47,375 @@ #include "sim/sim_exit.hh" // for SimExitEvent -bool SIGNAL_NACK_HACK; - -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>:: -recvStatusChange(Port::Status status, bool isCpuSide) -{ -} - - -template<class TagStore, class Coherence> -Cache<TagStore,Coherence>:: -Cache(const std::string &_name, - Cache<TagStore,Coherence>::Params ¶ms) +template<class TagStore> +Cache<TagStore>::Cache(const std::string &_name, + Cache<TagStore>::Params ¶ms) : BaseCache(_name, params.baseParams), prefetchAccess(params.prefetchAccess), - tags(params.tags), missQueue(params.missQueue), - coherence(params.coherence), prefetcher(params.prefetcher), - hitLatency(params.hitLatency), - compressionAlg(params.compressionAlg), - blkSize(params.blkSize), + tags(params.tags), + prefetcher(params.prefetcher), doFastWrites(params.doFastWrites), - prefetchMiss(params.prefetchMiss), - storeCompressed(params.storeCompressed), - compressOnWriteback(params.compressOnWriteback), - compLatency(params.compLatency), - adaptiveCompression(params.adaptiveCompression), - writebackCompressed(params.writebackCompressed) + prefetchMiss(params.prefetchMiss) { + tempBlock = new BlkType(); + tempBlock->data = new uint8_t[blkSize]; + + cpuSidePort = new CpuSidePort(_name + "-cpu_side_port", this); + memSidePort = new MemSidePort(_name + "-mem_side_port", this); + cpuSidePort->setOtherPort(memSidePort); + memSidePort->setOtherPort(cpuSidePort); + tags->setCache(this); - missQueue->setCache(this); - missQueue->setPrefetcher(prefetcher); - coherence->setCache(this); prefetcher->setCache(this); - invalidateReq = new Request((Addr) NULL, blkSize, 0); - invalidatePkt = new Packet(invalidateReq, MemCmd::InvalidateReq, 0); } -template<class TagStore, class Coherence> +template<class TagStore> void -Cache<TagStore,Coherence>::regStats() +Cache<TagStore>::regStats() { BaseCache::regStats(); tags->regStats(name()); - missQueue->regStats(name()); - coherence->regStats(name()); prefetcher->regStats(name()); } -template<class TagStore, class Coherence> -typename Cache<TagStore,Coherence>::BlkType* -Cache<TagStore,Coherence>::handleAccess(PacketPtr &pkt, int & lat, - PacketList & writebacks, bool update) +template<class TagStore> +Port * +Cache<TagStore>::getPort(const std::string &if_name, int idx) { - // Set the block offset here - int offset = tags->extractBlkOffset(pkt->getAddr()); - - BlkType *blk = NULL; - if (update) { - blk = tags->findBlock(pkt->getAddr(), lat); + if (if_name == "" || if_name == "cpu_side") { + return cpuSidePort; + } else if (if_name == "mem_side") { + return memSidePort; + } else if (if_name == "functional") { + return new CpuSidePort(name() + "-cpu_side_funcport", this); } else { - blk = tags->findBlock(pkt->getAddr()); - lat = 0; + panic("Port name %s unrecognized\n", if_name); } - if (blk != NULL) { - - if (!update) { - - if (pkt->isWrite()){ - assert(offset < blkSize); - assert(pkt->getSize() <= 
blkSize); - assert(offset+pkt->getSize() <= blkSize); - std::memcpy(blk->data + offset, pkt->getPtr<uint8_t>(), - pkt->getSize()); - } else if (pkt->isReadWrite()) { - cmpAndSwap(blk, pkt); - } else if (!(pkt->flags & SATISFIED)) { - pkt->flags |= SATISFIED; - pkt->result = Packet::Success; - assert(offset < blkSize); - assert(pkt->getSize() <= blkSize); - assert(offset + pkt->getSize() <=blkSize); - std::memcpy(pkt->getPtr<uint8_t>(), blk->data + offset, - pkt->getSize()); - } - return blk; - } +} - // Hit - if (blk->isPrefetch()) { - //Signal that this was a hit under prefetch (no need for - //use prefetch (only can get here if true) - DPRINTF(HWPrefetch, "Hit a block that was prefetched\n"); - blk->status &= ~BlkHWPrefetched; - if (prefetchMiss) { - //If we are using the miss stream, signal the - //prefetcher otherwise the access stream would have - //already signaled this hit - prefetcher->handleMiss(pkt, curTick); - } - } +template<class TagStore> +void +Cache<TagStore>::deletePortRefs(Port *p) +{ + if (cpuSidePort == p || memSidePort == p) + panic("Can only delete functional ports\n"); - if ((pkt->isReadWrite() && blk->isWritable()) || - (pkt->isWrite() && blk->isWritable()) || - (pkt->isRead() && blk->isValid())) { + delete p; +} - // We are satisfying the request - pkt->flags |= SATISFIED; - if (blk->isCompressed()) { - // If the data is compressed, need to increase the latency - lat += (compLatency/4); - } +template<class TagStore> +void +Cache<TagStore>::cmpAndSwap(BlkType *blk, PacketPtr pkt) +{ + uint64_t overwrite_val; + bool overwrite_mem; + uint64_t condition_val64; + uint32_t condition_val32; - bool write_data = false; + int offset = tags->extractBlkOffset(pkt->getAddr()); + uint8_t *blk_data = blk->data + offset; + + assert(sizeof(uint64_t) >= pkt->getSize()); + + overwrite_mem = true; + // keep a copy of our possible write value, and copy what is at the + // memory address into the packet + pkt->writeData((uint8_t *)&overwrite_val); + pkt->setData(blk_data); + + if (pkt->req->isCondSwap()) { + if (pkt->getSize() == sizeof(uint64_t)) { + condition_val64 = pkt->req->getExtraData(); + overwrite_mem = !std::memcmp(&condition_val64, blk_data, + sizeof(uint64_t)); + } else if (pkt->getSize() == sizeof(uint32_t)) { + condition_val32 = (uint32_t)pkt->req->getExtraData(); + overwrite_mem = !std::memcmp(&condition_val32, blk_data, + sizeof(uint32_t)); + } else + panic("Invalid size for conditional read/write\n"); + } - assert(verifyData(blk)); + if (overwrite_mem) + std::memcpy(blk_data, &overwrite_val, pkt->getSize()); +} - assert(offset < blkSize); - assert(pkt->getSize() <= blkSize); - assert(offset+pkt->getSize() <= blkSize); - if (pkt->isWrite()) { - if (blk->checkWrite(pkt->req)) { - write_data = true; - blk->status |= BlkDirty; - std::memcpy(blk->data + offset, pkt->getPtr<uint8_t>(), - pkt->getSize()); - } - } else if (pkt->isReadWrite()) { - cmpAndSwap(blk, pkt); +template<class TagStore> +void +Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk) +{ + assert(blk); + // Occasionally this is not true... if we are a lower-level cache + // satisfying a string of Read and ReadEx requests from + // upper-level caches, a Read will mark the block as shared but we + // can satisfy a following ReadEx anyway since we can rely on the + // Read requester(s) to have buffered the ReadEx snoop and to + // invalidate their blocks after receiving them. + // assert(pkt->needsExclusive() ? 
blk->isWritable() : blk->isValid()); + assert(pkt->getOffset(blkSize) + pkt->getSize() <= blkSize); + + // Check RMW operations first since both isRead() and + // isWrite() will be true for them + if (pkt->cmd == MemCmd::SwapReq) { + cmpAndSwap(blk, pkt); + } else if (pkt->isWrite()) { + if (blk->checkWrite(pkt)) { + blk->status |= BlkDirty; + pkt->writeDataToBlock(blk->data, blkSize); + } + } else if (pkt->isRead()) { + if (pkt->isLocked()) { + blk->trackLoadLocked(pkt); + } + pkt->setDataFromBlock(blk->data, blkSize); + if (pkt->getSize() == blkSize) { + // special handling for coherent block requests from + // upper-level caches + if (pkt->needsExclusive()) { + // on ReadExReq we give up our copy + tags->invalidateBlk(blk); } else { - assert(pkt->isRead()); - if (pkt->req->isLocked()) { - blk->trackLoadLocked(pkt->req); - } - std::memcpy(pkt->getPtr<uint8_t>(), blk->data + offset, - pkt->getSize()); - } - - if (write_data || - (adaptiveCompression && blk->isCompressed())) - { - // If we wrote data, need to update the internal block - // data. - updateData(blk, writebacks, - !(adaptiveCompression && - blk->isReferenced())); + // on ReadReq we create shareable copies here and in + // the requester + pkt->assertShared(); + blk->status &= ~BlkWritable; } - } else { - // permission violation, treat it as a miss - blk = NULL; } } else { - // complete miss (no matching block) - if (pkt->req->isLocked() && pkt->isWrite()) { - // miss on store conditional... just give up now - pkt->req->setExtraData(0); - pkt->flags |= SATISFIED; - } + // Not a read or write... must be an upgrade. it's OK + // to just ack those as long as we have an exclusive + // copy at this level. + assert(pkt->cmd == MemCmd::UpgradeReq); + tags->invalidateBlk(blk); } - - return blk; } -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::cmpAndSwap(BlkType *blk, PacketPtr &pkt){ - uint64_t overwrite_val; - bool overwrite_mem; - uint64_t condition_val64; - uint32_t condition_val32; - - int offset = tags->extractBlkOffset(pkt->getAddr()); - - assert(sizeof(uint64_t) >= pkt->getSize()); - - overwrite_mem = true; - // keep a copy of our possible write value, and copy what is at the - // memory address into the packet - std::memcpy(&overwrite_val, pkt->getPtr<uint8_t>(), pkt->getSize()); - std::memcpy(pkt->getPtr<uint8_t>(), blk->data + offset, - pkt->getSize()); - - if (pkt->req->isCondSwap()) { - if (pkt->getSize() == sizeof(uint64_t)) { - condition_val64 = pkt->req->getExtraData(); - overwrite_mem = !std::memcmp(&condition_val64, blk->data + offset, - sizeof(uint64_t)); - } else if (pkt->getSize() == sizeof(uint32_t)) { - condition_val32 = (uint32_t)pkt->req->getExtraData(); - overwrite_mem = !std::memcmp(&condition_val32, blk->data + offset, - sizeof(uint32_t)); - } else - panic("Invalid size for conditional read/write\n"); - } - if (overwrite_mem) - std::memcpy(blk->data + offset, - &overwrite_val, pkt->getSize()); +///////////////////////////////////////////////////// +// +// MSHR helper functions +// +///////////////////////////////////////////////////// -} -template<class TagStore, class Coherence> -typename Cache<TagStore,Coherence>::BlkType* -Cache<TagStore,Coherence>::handleFill(BlkType *blk, PacketPtr &pkt, - CacheBlk::State new_state, - PacketList & writebacks, - PacketPtr target) +template<class TagStore> +void +Cache<TagStore>::markInService(MSHR *mshr) { -#ifndef NDEBUG - BlkType *tmp_blk = tags->findBlock(pkt->getAddr()); - assert(tmp_blk == blk); + markInServiceInternal(mshr); +#if 0 + if 
(mshr->originalCmd == MemCmd::HardPFReq) { + DPRINTF(HWPrefetch, "%s:Marking a HW_PF in service\n", + name()); + //Also clear pending if need be + if (!prefetcher->havePending()) + { + deassertMemSideBusRequest(Request_PF); + } + } #endif - blk = doReplacement(blk, pkt, new_state, writebacks); - - - if (pkt->isRead()) { - std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize); - } - - blk->whenReady = pkt->finishTime; - - // Respond to target, if any - if (target) { +} - target->flags |= SATISFIED; - if (target->cmd == MemCmd::InvalidateReq) { - tags->invalidateBlk(blk); - blk = NULL; - } +template<class TagStore> +void +Cache<TagStore>::squash(int threadNum) +{ + bool unblock = false; + BlockedCause cause = NUM_BLOCKED_CAUSES; - if (blk && ((target->isWrite() || target->isReadWrite()) ? - blk->isWritable() : blk->isValid())) { - assert(target->isWrite() || target->isReadWrite() || target->isRead()); - assert(target->getOffset(blkSize) + target->getSize() <= blkSize); - if (target->isWrite()) { - if (blk->checkWrite(pkt->req)) { - blk->status |= BlkDirty; - std::memcpy(blk->data + target->getOffset(blkSize), - target->getPtr<uint8_t>(), target->getSize()); - } - } else if (target->isReadWrite()) { - cmpAndSwap(blk, target); - } else { - if (pkt->req->isLocked()) { - blk->trackLoadLocked(pkt->req); - } - std::memcpy(target->getPtr<uint8_t>(), - blk->data + target->getOffset(blkSize), - target->getSize()); - } - } + if (noTargetMSHR && noTargetMSHR->threadNum == threadNum) { + noTargetMSHR = NULL; + unblock = true; + cause = Blocked_NoTargets; } - - if (blk) { - // Need to write the data into the block - updateData(blk, writebacks, !adaptiveCompression || true); + if (mshrQueue.isFull()) { + unblock = true; + cause = Blocked_NoMSHRs; + } + mshrQueue.squash(threadNum); + if (unblock && !mshrQueue.isFull()) { + clearBlocked(cause); } - return blk; } -template<class TagStore, class Coherence> -typename Cache<TagStore,Coherence>::BlkType* -Cache<TagStore,Coherence>::handleFill(BlkType *blk, MSHR * mshr, - CacheBlk::State new_state, - PacketList & writebacks, PacketPtr pkt) -{ -/* -#ifndef NDEBUG - BlkType *tmp_blk = findBlock(mshr->pkt->getAddr()); - assert(tmp_blk == blk); -#endif - PacketPtr pkt = mshr->pkt;*/ - blk = doReplacement(blk, pkt, new_state, writebacks); +///////////////////////////////////////////////////// +// +// Access path: requests coming in from the CPU side +// +///////////////////////////////////////////////////// - if (pkt->isRead()) { - std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize); +template<class TagStore> +bool +Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk, int &lat) +{ + if (pkt->req->isUncacheable()) { + blk = NULL; + lat = hitLatency; + return false; } - blk->whenReady = pkt->finishTime; - - - // respond to MSHR targets, if any + bool satisfied = false; // assume the worst + blk = tags->findBlock(pkt->getAddr(), lat); - // First offset for critical word first calculations - int initial_offset = 0; - - if (mshr->hasTargets()) { - initial_offset = mshr->getTarget()->getOffset(blkSize); + if (prefetchAccess) { + //We are determining prefetches on access stream, call prefetcher + prefetcher->handleMiss(pkt, curTick); } - while (mshr->hasTargets()) { - PacketPtr target = mshr->getTarget(); - - target->flags |= SATISFIED; - - // How many bytes pass the first request is this one - int transfer_offset = target->getOffset(blkSize) - initial_offset; - if (transfer_offset < 0) { - transfer_offset += blkSize; - } - - // If critical word (no offset) return 
first word time - Tick completion_time = tags->getHitLatency() + - transfer_offset ? pkt->finishTime : pkt->firstWordTime; + DPRINTF(Cache, "%s %x %s\n", pkt->cmdString(), pkt->getAddr(), + (blk) ? "hit" : "miss"); - if (target->cmd == MemCmd::InvalidateReq) { - //Mark the blk as invalid now, if it hasn't been already - if (blk) { - tags->invalidateBlk(blk); - blk = NULL; + if (blk != NULL) { + // HIT + if (blk->isPrefetch()) { + //Signal that this was a hit under prefetch (no need for + //use prefetch (only can get here if true) + DPRINTF(HWPrefetch, "Hit a block that was prefetched\n"); + blk->status &= ~BlkHWPrefetched; + if (prefetchMiss) { + //If we are using the miss stream, signal the + //prefetcher otherwise the access stream would have + //already signaled this hit + prefetcher->handleMiss(pkt, curTick); } - - //Also get rid of the invalidate - mshr->popTarget(); - - DPRINTF(Cache, "Popping off a Invalidate for addr %x\n", - pkt->getAddr()); - - continue; } - if (blk && ((target->isWrite() || target->isReadWrite()) ? - blk->isWritable() : blk->isValid())) { - assert(target->isWrite() || target->isRead() || target->isReadWrite() ); - assert(target->getOffset(blkSize) + target->getSize() <= blkSize); - if (target->isWrite()) { - if (blk->checkWrite(pkt->req)) { - blk->status |= BlkDirty; - std::memcpy(blk->data + target->getOffset(blkSize), - target->getPtr<uint8_t>(), target->getSize()); - } - } else if (target->isReadWrite()) { - cmpAndSwap(blk, target); - } else { - if (target->req->isLocked()) { - blk->trackLoadLocked(target->req); - } - std::memcpy(target->getPtr<uint8_t>(), - blk->data + target->getOffset(blkSize), - target->getSize()); - } + if (pkt->needsExclusive() ? blk->isWritable() : blk->isValid()) { + // OK to satisfy access + hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; + satisfied = true; + satisfyCpuSideRequest(pkt, blk); + } else if (pkt->cmd == MemCmd::Writeback) { + // special case: writeback to read-only block (e.g., from + // L1 into L2). since we're really just passing ownership + // from one cache to another, we can update this cache to + // be the owner without making the block writeable + assert(!blk->isWritable() /* && !blk->isDirty() */); + assert(blkSize == pkt->getSize()); + std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize); + blk->status |= BlkDirty; + satisfied = true; + // nothing else to do; writeback doesn't expect response + assert(!pkt->needsResponse()); } else { - // Invalid access, need to do another request - // can occur if block is invalidated, or not correct - // permissions -// mshr->pkt = pkt; - break; + // permission violation... nothing to do here, leave unsatisfied + // for statistics purposes this counts like a complete miss + incMissCount(pkt); } - respondToMiss(target, completion_time); - mshr->popTarget(); - } + } else { + // complete miss (no matching block) + incMissCount(pkt); - if (blk) { - // Need to write the data into the block - updateData(blk, writebacks, !adaptiveCompression || true); + if (pkt->isLocked() && pkt->isWrite()) { + // miss on store conditional... 
just give up now + pkt->req->setExtraData(0); + satisfied = true; + } } - return blk; + return satisfied; } -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::handleSnoop(BlkType *blk, - CacheBlk::State new_state, - PacketPtr &pkt) +class ForwardResponseRecord : public Packet::SenderState { - //Must have the block to supply - assert(blk); - // Can only supply data, and if it hasn't already been supllied - assert(pkt->isRead()); - assert(!(pkt->flags & SATISFIED)); - pkt->flags |= SATISFIED; - Addr offset = pkt->getOffset(blkSize); - assert(offset < blkSize); - assert(pkt->getSize() <= blkSize); - assert(offset + pkt->getSize() <=blkSize); - std::memcpy(pkt->getPtr<uint8_t>(), blk->data + offset, pkt->getSize()); - - handleSnoop(blk, new_state); -} - -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::handleSnoop(BlkType *blk, - CacheBlk::State new_state) -{ - if (blk && blk->status != new_state) { - if ((new_state && BlkValid) == 0) { - tags->invalidateBlk(blk); - } else { - assert(new_state >= 0 && new_state < 128); - blk->status = new_state; - } - } -} - -template<class TagStore, class Coherence> -PacketPtr -Cache<TagStore,Coherence>::writebackBlk(BlkType *blk) -{ - assert(blk && blk->isValid() && blk->isModified()); - int data_size = blkSize; - data_size = blk->size; - if (compressOnWriteback) { - // not already compressed - // need to compress to ship it - assert(data_size == blkSize); - uint8_t *tmp_data = new uint8_t[blkSize]; - data_size = compressionAlg->compress(tmp_data,blk->data, - data_size); - delete [] tmp_data; + Packet::SenderState *prevSenderState; + int prevSrc; +#ifndef NDEBUG + BaseCache *cache; +#endif + public: + ForwardResponseRecord(Packet *pkt, BaseCache *_cache) + : prevSenderState(pkt->senderState), prevSrc(pkt->getSrc()) +#ifndef NDEBUG + , cache(_cache) +#endif + {} + void restore(Packet *pkt, BaseCache *_cache) + { + assert(_cache == cache); + pkt->senderState = prevSenderState; + pkt->setDest(prevSrc); } +}; -/* PacketPtr writeback = - buildWritebackReq(tags->regenerateBlkAddr(blk->tag, blk->set), - blk->asid, blkSize, - blk->data, data_size); -*/ - - Request *writebackReq = - new Request(tags->regenerateBlkAddr(blk->tag, blk->set), blkSize, 0); - PacketPtr writeback = new Packet(writebackReq, MemCmd::Writeback, -1); - writeback->allocate(); - std::memcpy(writeback->getPtr<uint8_t>(),blk->data,blkSize); - - blk->status &= ~BlkDirty; - return writeback; -} - -template<class TagStore, class Coherence> +template<class TagStore> bool -Cache<TagStore,Coherence>::verifyData(BlkType *blk) +Cache<TagStore>::timingAccess(PacketPtr pkt) { - bool retval; - // The data stored in the blk - uint8_t *blk_data = new uint8_t[blkSize]; - tags->readData(blk, blk_data); - // Pointer for uncompressed data, assumed uncompressed - uint8_t *tmp_data = blk_data; - // The size of the data being stored, assumed uncompressed - int data_size = blkSize; - - // If the block is compressed need to uncompress to access - if (blk->isCompressed()){ - // Allocate new storage for the data - tmp_data = new uint8_t[blkSize]; - data_size = compressionAlg->uncompress(tmp_data,blk_data, blk->size); - assert(data_size == blkSize); - // Don't need to keep blk_data around - delete [] blk_data; - } else { - assert(blkSize == blk->size); - } - - retval = std::memcmp(tmp_data, blk->data, blkSize) == 0; - delete [] tmp_data; - return retval; -} +//@todo Add back in MemDebug Calls +// MemDebug::cacheAccess(pkt); -template<class TagStore, class Coherence> 
-void -Cache<TagStore,Coherence>::updateData(BlkType *blk, PacketList &writebacks, - bool compress_block) -{ - if (storeCompressed && compress_block) { - uint8_t *comp_data = new uint8_t[blkSize]; - int new_size = compressionAlg->compress(comp_data, blk->data, blkSize); - if (new_size > (blkSize - tags->getSubBlockSize())){ - // no benefit to storing it compressed - blk->status &= ~BlkCompressed; - tags->writeData(blk, blk->data, blkSize, - writebacks); - } else { - // Store the data compressed - blk->status |= BlkCompressed; - tags->writeData(blk, comp_data, new_size, - writebacks); - } - delete [] comp_data; - } else { - blk->status &= ~BlkCompressed; - tags->writeData(blk, blk->data, blkSize, writebacks); + // we charge hitLatency for doing just about anything here + Tick time = curTick + hitLatency; + + if (pkt->isResponse()) { + // must be cache-to-cache response from upper to lower level + ForwardResponseRecord *rec = + dynamic_cast<ForwardResponseRecord *>(pkt->senderState); + assert(rec != NULL); + rec->restore(pkt, this); + delete rec; + memSidePort->respond(pkt, time); + return true; } -} -template<class TagStore, class Coherence> -typename Cache<TagStore,Coherence>::BlkType* -Cache<TagStore,Coherence>::doReplacement(BlkType *blk, PacketPtr &pkt, - CacheBlk::State new_state, - PacketList &writebacks) -{ - if (blk == NULL) { - // need to do a replacement - BlkList compress_list; - blk = tags->findReplacement(pkt, writebacks, compress_list); - while (adaptiveCompression && !compress_list.empty()) { - updateData(compress_list.front(), writebacks, true); - compress_list.pop_front(); + assert(pkt->isRequest()); + + if (pkt->memInhibitAsserted()) { + DPRINTF(Cache, "mem inhibited on 0x%x: not responding\n", + pkt->getAddr()); + assert(!pkt->req->isUncacheable()); + // Special tweak for multilevel coherence: snoop downward here + // on invalidates since there may be other caches below here + // that have shared copies. Not necessary if we know that + // supplier had exclusive copy to begin with. + if (pkt->needsExclusive() && !pkt->isSupplyExclusive()) { + Packet *snoopPkt = new Packet(pkt, true); // clear flags + snoopPkt->setExpressSnoop(); + snoopPkt->assertMemInhibit(); + memSidePort->sendTiming(snoopPkt); + // main memory will delete snoopPkt } - if (blk->isValid()) { - DPRINTF(Cache, "replacement: replacing %x with %x: %s\n", - tags->regenerateBlkAddr(blk->tag,blk->set), pkt->getAddr(), - (blk->isModified()) ? "writeback" : "clean"); + return true; + } - if (blk->isModified()) { - // Need to write the data back - writebacks.push_back(writebackBlk(blk)); - } + if (pkt->req->isUncacheable()) { + // writes go in write buffer, reads use MSHR + if (pkt->isWrite() && !pkt->isRead()) { + allocateWriteBuffer(pkt, time, true); + } else { + allocateUncachedReadBuffer(pkt, time, true); } - blk->tag = tags->extractTag(pkt->getAddr(), blk); - } else { - // must be a status change - // assert(blk->status != new_state); - if (blk->status == new_state) warn("Changing state to same value\n"); + assert(pkt->needsResponse()); // else we should delete it here?? 
+ return true; } - blk->status = new_state; - return blk; -} - - -template<class TagStore, class Coherence> -bool -Cache<TagStore,Coherence>::access(PacketPtr &pkt) -{ -//@todo Add back in MemDebug Calls -// MemDebug::cacheAccess(pkt); - BlkType *blk = NULL; - PacketList writebacks; - int size = blkSize; int lat = hitLatency; - if (prefetchAccess) { - //We are determining prefetches on access stream, call prefetcher - prefetcher->handleMiss(pkt, curTick); - } + bool satisfied = false; Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1)); - - if (!pkt->req->isUncacheable()) { - if (!missQueue->findMSHR(blk_addr)) { - blk = handleAccess(pkt, lat, writebacks); - } - } else { - size = pkt->getSize(); + MSHR *mshr = mshrQueue.findMatch(blk_addr); + + if (!mshr) { + // no outstanding access to this block, look up in cache + // (otherwise if we allow reads while there's an outstanding + // write miss, the read could return stale data out of the + // cache block... a more aggressive system could detect the + // overlap (if any) and forward data out of the MSHRs, but we + // don't do that yet) + BlkType *blk = NULL; + satisfied = access(pkt, blk, lat); } + +#if 0 + PacketList writebacks; + // If this is a block size write/hint (WH64) allocate the block here // if the coherence protocol allows it. /** @todo make the fast write alloc (wh64) work with coherence. */ @@ -613,7 +424,7 @@ Cache<TagStore,Coherence>::access(PacketPtr &pkt) (pkt->cmd == MemCmd::WriteReq || pkt->cmd == MemCmd::WriteInvalidateReq) ) { // not outstanding misses, can do this - MSHR* outstanding_miss = missQueue->findMSHR(pkt->getAddr()); + MSHR *outstanding_miss = mshrQueue.findMatch(pkt->getAddr()); if (pkt->cmd == MemCmd::WriteInvalidateReq || !outstanding_miss) { if (outstanding_miss) { warn("WriteInv doing a fastallocate" @@ -624,683 +435,991 @@ Cache<TagStore,Coherence>::access(PacketPtr &pkt) ++fastWrites; } } + + // copy writebacks to write buffer while (!writebacks.empty()) { PacketPtr wbPkt = writebacks.front(); - missQueue->doWriteback(wbPkt); + allocateWriteBuffer(wbPkt, time, true); writebacks.pop_front(); - delete wbPkt; - } - - if (!pkt->req->isUncacheable()) { - DPRINTF(Cache, "%s %x %s\n", pkt->cmdString(), pkt->getAddr(), - (blk) ? "hit" : "miss"); } +#endif - if (blk) { - // Hit - hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; - // clear dirty bit if write through - respond(pkt, curTick+lat); - return true; - } + bool needsResponse = pkt->needsResponse(); - // Miss - if (!pkt->req->isUncacheable()) { - misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; - /** @todo Move miss count code into BaseCache */ - if (missCount) { - --missCount; - if (missCount == 0) - exitSimLoop("A cache reached the maximum miss count"); + if (satisfied) { + if (needsResponse) { + pkt->makeTimingResponse(); + cpuSidePort->respond(pkt, curTick+lat); + } else { + delete pkt; } - } - - if (pkt->flags & SATISFIED) { - // happens when a store conditional fails because it missed - // the cache completely - respond(pkt, curTick+lat); } else { - missQueue->handleMiss(pkt, size, curTick + hitLatency); - } + // miss + if (prefetchMiss) + prefetcher->handleMiss(pkt, time); - if (!pkt->needsResponse()) { - //Need to clean up the packet on a writeback miss, but leave the request - //for the next level. 
- delete pkt; + if (mshr) { + // MSHR hit + //@todo remove hw_pf here + mshr_hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; + if (mshr->threadNum != 0/*pkt->req->getThreadNum()*/) { + mshr->threadNum = -1; + } + mshr->allocateTarget(pkt, time, order++); + if (mshr->getNumTargets() == numTarget) { + noTargetMSHR = mshr; + setBlocked(Blocked_NoTargets); + // need to be careful with this... if this mshr isn't + // ready yet (i.e. time > curTick_, we don't want to + // move it ahead of mshrs that are ready + // mshrQueue.moveToFront(mshr); + } + } else { + // no MSHR + mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; + // always mark as cache fill for now... if we implement + // no-write-allocate or bypass accesses this will have to + // be changed. + if (pkt->cmd == MemCmd::Writeback) { + allocateWriteBuffer(pkt, time, true); + } else { + allocateMissBuffer(pkt, time, true); + } + } } return true; } -template<class TagStore, class Coherence> +template<class TagStore> PacketPtr -Cache<TagStore,Coherence>::getPacket() +Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk, + bool needsExclusive) { - assert(missQueue->havePending()); - PacketPtr pkt = missQueue->getPacket(); - if (pkt) { - if (!pkt->req->isUncacheable()) { - if (pkt->cmd == MemCmd::HardPFReq) - misses[MemCmd::HardPFReq][0/*pkt->req->getThreadNum()*/]++; - BlkType *blk = tags->findBlock(pkt->getAddr()); - MemCmd cmd = - coherence->getBusCmd(pkt->cmd, (blk) ? blk->status : 0); - missQueue->setBusCmd(pkt, cmd); - } + bool blkValid = blk && blk->isValid(); + + if (cpu_pkt->req->isUncacheable()) { + assert(blk == NULL); + return NULL; + } + + if (!blkValid && + (cpu_pkt->cmd == MemCmd::Writeback || + cpu_pkt->cmd == MemCmd::UpgradeReq)) { + // For now, writebacks from upper-level caches that + // completely miss in the cache just go through. If we had + // "fast write" support (where we could write the whole + // block w/o fetching new data) we might want to allocate + // on writeback misses instead. + return NULL; } - assert(!doMasterRequest() || missQueue->havePending()); - assert(!pkt || pkt->time <= curTick); - SIGNAL_NACK_HACK = false; + assert(cpu_pkt->needsResponse()); + + MemCmd cmd; + // @TODO make useUpgrades a parameter. + // Note that ownership protocols require upgrade, otherwise a + // write miss on a shared owned block will generate a ReadExcl, + // which will clobber the owned copy. + const bool useUpgrades = true; + if (blkValid && useUpgrades) { + // only reason to be here is that blk is shared + // (read-only) and we need exclusive + assert(needsExclusive && !blk->isWritable()); + cmd = MemCmd::UpgradeReq; + } else { + // block is invalid + cmd = needsExclusive ? 
MemCmd::ReadExReq : MemCmd::ReadReq; + } + PacketPtr pkt = new Packet(cpu_pkt->req, cmd, Packet::Broadcast, blkSize); + + pkt->allocate(); return pkt; } -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr, - bool success) + +template<class TagStore> +Tick +Cache<TagStore>::atomicAccess(PacketPtr pkt) { - if (success && !(SIGNAL_NACK_HACK)) { - //Remember if it was an upgrade because writeback MSHR's are removed - //in Mark in Service - bool upgrade = (mshr->pkt && mshr->pkt->cmd == MemCmd::UpgradeReq); + int lat = hitLatency; - missQueue->markInService(mshr->pkt, mshr); + // @TODO: make this a parameter + bool last_level_cache = false; - //Temp Hack for UPGRADES - if (upgrade) { - assert(pkt); //Upgrades need to be fixed - pkt->flags &= ~CACHE_LINE_FILL; + if (pkt->memInhibitAsserted()) { + assert(!pkt->req->isUncacheable()); + // have to invalidate ourselves and any lower caches even if + // upper cache will be responding + if (pkt->isInvalidate()) { BlkType *blk = tags->findBlock(pkt->getAddr()); - CacheBlk::State old_state = (blk) ? blk->status : 0; - CacheBlk::State new_state = coherence->getNewState(pkt,old_state); - //Set the state on the upgrade - std::memcpy(pkt->getPtr<uint8_t>(), blk->data, blkSize); - PacketList writebacks; - handleFill(blk, mshr, new_state, writebacks, pkt); - assert(writebacks.empty()); - if (old_state != new_state) - DPRINTF(Cache, "Block addr %x moving from state " - "%i to %i\n", pkt->getAddr(), old_state, new_state); - missQueue->handleResponse(pkt, curTick + hitLatency); + if (blk && blk->isValid()) { + tags->invalidateBlk(blk); + DPRINTF(Cache, "rcvd mem-inhibited %s on 0x%x: invalidating\n", + pkt->cmdString(), pkt->getAddr()); + } + if (!last_level_cache) { + DPRINTF(Cache, "forwarding mem-inhibited %s on 0x%x\n", + pkt->cmdString(), pkt->getAddr()); + lat += memSidePort->sendAtomic(pkt); + } + } else { + DPRINTF(Cache, "rcvd mem-inhibited %s on 0x%x: not responding\n", + pkt->cmdString(), pkt->getAddr()); } - } else if (pkt && !pkt->req->isUncacheable()) { - pkt->flags &= ~NACKED_LINE; - SIGNAL_NACK_HACK = false; - pkt->flags &= ~SATISFIED; - pkt->flags &= ~SNOOP_COMMIT; - -//Rmove copy from mshr - delete mshr->pkt; - mshr->pkt = pkt; - missQueue->restoreOrigCmd(pkt); + return lat; } -} -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::handleResponse(PacketPtr &pkt) -{ + // should assert here that there are no outstanding MSHRs or + // writebacks... that would mean that someone used an atomic + // access in timing mode + BlkType *blk = NULL; - if (pkt->senderState) { - //Delete temp copy in MSHR, restore it. 
- delete ((MSHR*)pkt->senderState)->pkt; - ((MSHR*)pkt->senderState)->pkt = pkt; - if (pkt->result == Packet::Nacked) { - //pkt->reinitFromRequest(); - warn("NACKs from devices not connected to the same bus " - "not implemented\n"); - return; - } - if (pkt->result == Packet::BadAddress) { - //Make the response a Bad address and send it + + if (!access(pkt, blk, lat)) { + // MISS + PacketPtr busPkt = getBusPacket(pkt, blk, pkt->needsExclusive()); + + bool isCacheFill = (busPkt != NULL); + + if (busPkt == NULL) { + // just forwarding the same request to the next level + // no local cache operation involved + busPkt = pkt; } -// MemDebug::cacheResponse(pkt); - DPRINTF(Cache, "Handling response to %x\n", pkt->getAddr()); - - if (pkt->isCacheFill() && !pkt->isNoAllocate()) { - DPRINTF(Cache, "Block for addr %x being updated in Cache\n", - pkt->getAddr()); - blk = tags->findBlock(pkt->getAddr()); - CacheBlk::State old_state = (blk) ? blk->status : 0; + + DPRINTF(Cache, "Sending an atomic %s for %x\n", + busPkt->cmdString(), busPkt->getAddr()); + +#if TRACING_ON + CacheBlk::State old_state = blk ? blk->status : 0; +#endif + + lat += memSidePort->sendAtomic(busPkt); + + DPRINTF(Cache, "Receive response: %s for addr %x in state %i\n", + busPkt->cmdString(), busPkt->getAddr(), old_state); + + if (isCacheFill) { PacketList writebacks; - CacheBlk::State new_state = coherence->getNewState(pkt,old_state); - blk = handleFill(blk, (MSHR*)pkt->senderState, - new_state, writebacks, pkt); - if (old_state != new_state) - DPRINTF(Cache, "Block addr %x moving from state %i to %i\n", - pkt->getAddr(), old_state, new_state); - while (!writebacks.empty()) { + blk = handleFill(busPkt, blk, writebacks); + satisfyCpuSideRequest(pkt, blk); + delete busPkt; + + // Handle writebacks if needed + while (!writebacks.empty()){ PacketPtr wbPkt = writebacks.front(); - missQueue->doWriteback(wbPkt); + memSidePort->sendAtomic(wbPkt); writebacks.pop_front(); delete wbPkt; } } - missQueue->handleResponse(pkt, curTick + hitLatency); } -} -template<class TagStore, class Coherence> -PacketPtr -Cache<TagStore,Coherence>::getCoherencePacket() -{ - return coherence->getPacket(); + // We now have the block one way or another (hit or completed miss) + + if (pkt->needsResponse()) { + pkt->makeAtomicResponse(); + } + + return lat; } -template<class TagStore, class Coherence> + +template<class TagStore> void -Cache<TagStore,Coherence>::sendCoherenceResult(PacketPtr &pkt, - MSHR *cshr, - bool success) +Cache<TagStore>::functionalAccess(PacketPtr pkt, + CachePort *otherSidePort) { - coherence->sendResult(pkt, cshr, success); + Addr blk_addr = pkt->getAddr() & ~(blkSize - 1); + BlkType *blk = tags->findBlock(pkt->getAddr()); + + if (blk && pkt->checkFunctional(blk_addr, blkSize, blk->data)) { + // request satisfied from block + return; + } + + // Need to check for outstanding misses and writes; if neither one + // satisfies, then forward to other side of cache. 
+ if (!(mshrQueue.checkFunctional(pkt, blk_addr) || + writeBuffer.checkFunctional(pkt, blk_addr))) { + otherSidePort->checkAndSendFunctional(pkt); + } } -template<class TagStore, class Coherence> +///////////////////////////////////////////////////// +// +// Response handling: responses from the memory side +// +///////////////////////////////////////////////////// + + +template<class TagStore> void -Cache<TagStore,Coherence>::snoop(PacketPtr &pkt) +Cache<TagStore>::handleResponse(PacketPtr pkt) { - if (pkt->req->isUncacheable()) { - //Can't get a hit on an uncacheable address - //Revisit this for multi level coherence + Tick time = curTick + hitLatency; + MSHR *mshr = dynamic_cast<MSHR*>(pkt->senderState); + assert(mshr); + + if (pkt->wasNacked()) { + //pkt->reinitFromRequest(); + warn("NACKs from devices not connected to the same bus " + "not implemented\n"); return; } + assert(!pkt->isError()); + DPRINTF(Cache, "Handling response to %x\n", pkt->getAddr()); - //Send a timing (true) invalidate up if the protocol calls for it - if (coherence->propogateInvalidate(pkt, true)) { - //Temp hack, we had a functional read hit in the L1, mark as success - pkt->flags |= SATISFIED; - pkt->result = Packet::Success; - respondToSnoop(pkt, curTick + hitLatency); - return; + MSHRQueue *mq = mshr->queue; + bool wasFull = mq->isFull(); + + if (mshr == noTargetMSHR) { + // we always clear at least one target + clearBlocked(Blocked_NoTargets); + noTargetMSHR = NULL; } - Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1)); + // Initial target is used just for stats + MSHR::Target *initial_tgt = mshr->getTarget(); BlkType *blk = tags->findBlock(pkt->getAddr()); - MSHR *mshr = missQueue->findMSHR(blk_addr); - if (coherence->hasProtocol() || pkt->isInvalidate()) { - //@todo Move this into handle bus req - //If we find an mshr, and it is in service, we need to NACK or - //invalidate - if (mshr) { - if (mshr->inService) { - if ((mshr->pkt->isInvalidate() || !mshr->pkt->isCacheFill()) - && (pkt->cmd != MemCmd::InvalidateReq - && pkt->cmd != MemCmd::WriteInvalidateReq)) { - //If the outstanding request was an invalidate - //(upgrade,readex,..) Then we need to ACK the request - //until we get the data Also NACK if the outstanding - //request is not a cachefill (writeback) - assert(!(pkt->flags & SATISFIED)); - pkt->flags |= SATISFIED; - pkt->flags |= NACKED_LINE; - SIGNAL_NACK_HACK = true; - ///@todo NACK's from other levels - //warn("NACKs from devices not connected to the same bus " - //"not implemented\n"); - //respondToSnoop(pkt, curTick + hitLatency); - return; - } - else { - //The supplier will be someone else, because we are - //waiting for the data. This should cause this cache to - //be forced to go to the shared state, not the exclusive - //even though the shared line won't be asserted. But for - //now we will just invlidate ourselves and allow the other - //cache to go into the exclusive state. @todo Make it so - //a read to a pending read doesn't invalidate. @todo Make - //it so that a read to a pending read can't be exclusive - //now. 
- - //Set the address so find match works - //panic("Don't have invalidates yet\n"); - invalidatePkt->addrOverride(pkt->getAddr()); - - //Append the invalidate on - missQueue->addTarget(mshr,invalidatePkt); - DPRINTF(Cache, "Appending Invalidate to addr: %x\n", - pkt->getAddr()); - return; + int stats_cmd_idx = initial_tgt->pkt->cmdToIndex(); + Tick miss_latency = curTick - initial_tgt->recvTime; + PacketList writebacks; + + if (pkt->req->isUncacheable()) { + mshr_uncacheable_lat[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] += + miss_latency; + } else { + mshr_miss_latency[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] += + miss_latency; + } + + if (mshr->isCacheFill) { + DPRINTF(Cache, "Block for addr %x being updated in Cache\n", + pkt->getAddr()); + + // give mshr a chance to do some dirty work + mshr->handleFill(pkt, blk); + + blk = handleFill(pkt, blk, writebacks); + assert(blk != NULL); + } + + // First offset for critical word first calculations + int initial_offset = 0; + + if (mshr->hasTargets()) { + initial_offset = mshr->getTarget()->pkt->getOffset(blkSize); + } + + while (mshr->hasTargets()) { + MSHR::Target *target = mshr->getTarget(); + + if (target->isCpuSide()) { + Tick completion_time; + if (blk != NULL) { + satisfyCpuSideRequest(target->pkt, blk); + // How many bytes past the first request is this one + int transfer_offset = + target->pkt->getOffset(blkSize) - initial_offset; + if (transfer_offset < 0) { + transfer_offset += blkSize; } - } - } - //We also need to check the writeback buffers and handle those - std::vector<MSHR *> writebacks; - if (missQueue->findWrites(blk_addr, writebacks)) { - DPRINTF(Cache, "Snoop hit in writeback to addr: %x\n", - pkt->getAddr()); - - //Look through writebacks for any non-uncachable writes, use that - for (int i=0; i<writebacks.size(); i++) { - mshr = writebacks[i]; - - if (!mshr->pkt->req->isUncacheable()) { - if (pkt->isRead()) { - //Only Upgrades don't get here - //Supply the data - assert(!(pkt->flags & SATISFIED)); - pkt->flags |= SATISFIED; - - //If we are in an exclusive protocol, make it ask again - //to get write permissions (upgrade), signal shared - pkt->flags |= SHARED_LINE; - - assert(pkt->isRead()); - Addr offset = pkt->getAddr() & (blkSize - 1); - assert(offset < blkSize); - assert(pkt->getSize() <= blkSize); - assert(offset + pkt->getSize() <=blkSize); - std::memcpy(pkt->getPtr<uint8_t>(), mshr->pkt->getPtr<uint8_t>() + offset, pkt->getSize()); - - respondToSnoop(pkt, curTick + hitLatency); - } - - if (pkt->isInvalidate()) { - //This must be an upgrade or other cache will take - //ownership - missQueue->markInService(mshr->pkt, mshr); - } - return; + + // If critical word (no offset) return first word time + completion_time = tags->getHitLatency() + + transfer_offset ? 
pkt->finishTime : pkt->firstWordTime; + + assert(!target->pkt->req->isUncacheable()); + missLatency[target->pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] += + completion_time - target->recvTime; + } else { + // not a cache fill, just forwarding response + completion_time = tags->getHitLatency() + pkt->finishTime; + if (pkt->isRead()) { + target->pkt->setData(pkt->getPtr<uint8_t>()); } } + target->pkt->makeTimingResponse(); + cpuSidePort->respond(target->pkt, completion_time); + } else { + // response to snoop request + DPRINTF(Cache, "processing deferred snoop...\n"); + handleSnoop(target->pkt, blk, true, true); } - } - CacheBlk::State new_state; - bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state); - if (blk && mshr && !mshr->inService && new_state == 0) { - //There was a outstanding write to a shared block, not need ReadEx - //not update, so change No Allocate param in MSHR - mshr->pkt->flags &= ~NO_ALLOCATE; + mshr->popTarget(); } - if (satisfy) { - DPRINTF(Cache, "snooped a %s request for addr %x and " - "now supplying data, new state is %i\n", - pkt->cmdString(), blk_addr, new_state); + if (mshr->promoteDeferredTargets()) { + MSHRQueue *mq = mshr->queue; + mq->markPending(mshr); + requestMemSideBus((RequestCause)mq->index, pkt->finishTime); + } else { + mq->deallocate(mshr); + if (wasFull && !mq->isFull()) { + clearBlocked((BlockedCause)mq->index); + } + } - handleSnoop(blk, new_state, pkt); - respondToSnoop(pkt, curTick + hitLatency); - return; + // copy writebacks to write buffer + while (!writebacks.empty()) { + PacketPtr wbPkt = writebacks.front(); + allocateWriteBuffer(wbPkt, time, true); + writebacks.pop_front(); + } + // if we used temp block, clear it out + if (blk == tempBlock) { + if (blk->isDirty()) { + allocateWriteBuffer(writebackBlk(blk), time, true); + } + tags->invalidateBlk(blk); } - if (blk) - DPRINTF(Cache, "snooped a %s request for addr %x, " - "new state is %i\n", pkt->cmdString(), blk_addr, new_state); - handleSnoop(blk, new_state); + delete pkt; } -template<class TagStore, class Coherence> -void -Cache<TagStore,Coherence>::snoopResponse(PacketPtr &pkt) + + + +template<class TagStore> +PacketPtr +Cache<TagStore>::writebackBlk(BlkType *blk) { - //Need to handle the response, if NACKED - if (pkt->flags & NACKED_LINE) { - //Need to mark it as not in service, and retry for bus - assert(0); //Yeah, we saw a NACK come through - - //For now this should never get called, we return false when we see a - //NACK instead, by doing this we allow the bus_blocked mechanism to - //handle the retry For now it retrys in just 2 cycles, need to figure - //out how to change that Eventually we will want to also have success - //come in as a parameter Need to make sure that we handle the - //functionality that happens on successufl return of the sendAddr - //function - } + assert(blk && blk->isValid() && blk->isDirty()); + + writebacks[0/*pkt->req->getThreadNum()*/]++; + + Request *writebackReq = + new Request(tags->regenerateBlkAddr(blk->tag, blk->set), blkSize, 0); + PacketPtr writeback = new Packet(writebackReq, MemCmd::Writeback, -1); + writeback->allocate(); + std::memcpy(writeback->getPtr<uint8_t>(), blk->data, blkSize); + + blk->status &= ~BlkDirty; + return writeback; } -/** - * @todo Fix to not assume write allocate - */ -template<class TagStore, class Coherence> -Tick -Cache<TagStore,Coherence>::probe(PacketPtr &pkt, bool update, - CachePort* otherSidePort) +// Note that the reason we return a list of writebacks rather than +// inserting them directly in 
the write buffer is that this function +// is called by both atomic and timing-mode accesses, and in atomic +// mode we don't mess with the write buffer (we just perform the +// writebacks atomically once the original request is complete). +template<class TagStore> +typename Cache<TagStore>::BlkType* +Cache<TagStore>::handleFill(PacketPtr pkt, BlkType *blk, + PacketList &writebacks) { -// MemDebug::cacheProbe(pkt); - if (!pkt->req->isUncacheable()) { - if (pkt->isInvalidate() && !pkt->isRead() && !pkt->isWrite()) { - //Upgrade or Invalidate, satisfy it, don't forward - DPRINTF(Cache, "%s %x ?\n", pkt->cmdString(), pkt->getAddr()); - pkt->flags |= SATISFIED; - return 0; + Addr addr = pkt->getAddr(); +#if TRACING_ON + CacheBlk::State old_state = blk ? blk->status : 0; +#endif + + if (blk == NULL) { + // better have read new data... + assert(pkt->isRead()); + + // need to do a replacement + blk = tags->findReplacement(addr, writebacks); + if (blk->isValid()) { + Addr repl_addr = tags->regenerateBlkAddr(blk->tag, blk->set); + MSHR *repl_mshr = mshrQueue.findMatch(repl_addr); + if (repl_mshr) { + // must be an outstanding upgrade request on block + // we're about to replace... + assert(!blk->isWritable()); + assert(repl_mshr->needsExclusive()); + // too hard to replace block with transient state; + // just use temporary storage to complete the current + // request and then get rid of it + assert(!tempBlock->isValid()); + blk = tempBlock; + tempBlock->set = tags->extractSet(addr); + DPRINTF(Cache, "using temp block for %x\n", addr); + } else { + DPRINTF(Cache, "replacement: replacing %x with %x: %s\n", + repl_addr, addr, + blk->isDirty() ? "writeback" : "clean"); + + if (blk->isDirty()) { + // Save writeback packet for handling by caller + writebacks.push_back(writebackBlk(blk)); + } + } } + + blk->tag = tags->extractTag(addr); + } else { + // existing block... probably an upgrade + assert(blk->tag == tags->extractTag(addr)); + // either we're getting new data or the block should already be valid + assert(pkt->isRead() || blk->isValid()); } - if (!update && (otherSidePort == cpuSidePort)) { - // Still need to change data in all locations. - otherSidePort->checkAndSendFunctional(pkt); - if (pkt->isRead() && pkt->result == Packet::Success) - return 0; + if (pkt->needsExclusive() || !pkt->sharedAsserted()) { + blk->status = BlkValid | BlkWritable; + } else { + blk->status = BlkValid; } - PacketList writebacks; - int lat; + DPRINTF(Cache, "Block addr %x moving from state %i to %i\n", + addr, old_state, blk->status); - BlkType *blk = handleAccess(pkt, lat, writebacks, update); + // if we got new data, copy it in + if (pkt->isRead()) { + std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize); + } - DPRINTF(Cache, "%s %x %s\n", pkt->cmdString(), - pkt->getAddr(), (blk) ? "hit" : "miss"); + blk->whenReady = pkt->finishTime; + return blk; +} - // Need to check for outstanding misses and writes - Addr blk_addr = pkt->getAddr() & ~(blkSize - 1); - // There can only be one matching outstanding miss. - MSHR* mshr = missQueue->findMSHR(blk_addr); +///////////////////////////////////////////////////// +// +// Snoop path: requests coming in from the memory side +// +///////////////////////////////////////////////////// - // There can be many matching outstanding writes. 
- std::vector<MSHR*> writes; - missQueue->findWrites(blk_addr, writes); +template<class TagStore> +void +Cache<TagStore>::doTimingSupplyResponse(PacketPtr req_pkt, + uint8_t *blk_data, + bool already_copied) +{ + // timing-mode snoop responses require a new packet, unless we + // already made a copy... + PacketPtr pkt = already_copied ? req_pkt : new Packet(req_pkt, true); + if (!req_pkt->isInvalidate()) { + // note that we're ignoring the shared flag on req_pkt... it's + // basically irrelveant, as we'll always assert shared unless + // it's an exclusive request, in which case the shared line + // should never be asserted1 + pkt->assertShared(); + } + pkt->allocate(); + pkt->makeTimingResponse(); + if (pkt->isRead()) { + pkt->setDataFromBlock(blk_data, blkSize); + } + memSidePort->respond(pkt, curTick + hitLatency); +} - if (!update) { - bool notDone = !(pkt->flags & SATISFIED); //Hit in cache (was a block) - // Check for data in MSHR and writebuffer. - if (mshr) { - MSHR::TargetList *targets = mshr->getTargetList(); - MSHR::TargetList::iterator i = targets->begin(); - MSHR::TargetList::iterator end = targets->end(); - for (; i != end && notDone; ++i) { - PacketPtr target = *i; - // If the target contains data, and it overlaps the - // probed request, need to update data - if (target->intersect(pkt)) { - DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a MSHR\n", - pkt->cmdString(), blk_addr); - notDone = fixPacket(pkt, target); - } - } - } - for (int i = 0; i < writes.size() && notDone; ++i) { - PacketPtr write = writes[i]->pkt; - if (write->intersect(pkt)) { - DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a writeback\n", - pkt->cmdString(), blk_addr); - notDone = fixPacket(pkt, write); - } +template<class TagStore> +void +Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk, + bool is_timing, bool is_deferred) +{ + assert(pkt->isRequest()); + + // first propagate snoop upward to see if anyone above us wants to + // handle it. save & restore packet src since it will get + // rewritten to be relative to cpu-side bus (if any) + bool alreadyResponded = pkt->memInhibitAsserted(); + if (is_timing) { + Packet *snoopPkt = new Packet(pkt, true); // clear flags + snoopPkt->setExpressSnoop(); + snoopPkt->senderState = new ForwardResponseRecord(pkt, this); + cpuSidePort->sendTiming(snoopPkt); + if (snoopPkt->memInhibitAsserted()) { + // cache-to-cache response from some upper cache + assert(!alreadyResponded); + pkt->assertMemInhibit(); + } else { + delete snoopPkt->senderState; } - if (notDone && otherSidePort == memSidePort) { - otherSidePort->checkAndSendFunctional(pkt); - assert(pkt->result == Packet::Success); + if (snoopPkt->sharedAsserted()) { + pkt->assertShared(); } - return 0; - } else if (!blk && !(pkt->flags & SATISFIED)) { - // update the cache state and statistics - if (mshr || !writes.empty()){ - // Can't handle it, return request unsatisfied. 
- panic("Atomic access ran into outstanding MSHR's or WB's!"); + delete snoopPkt; + } else { + int origSrc = pkt->getSrc(); + cpuSidePort->sendAtomic(pkt); + if (!alreadyResponded && pkt->memInhibitAsserted()) { + // cache-to-cache response from some upper cache: + // forward response to original requester + assert(pkt->isResponse()); } - if (!pkt->req->isUncacheable() /*Uncacheables just go through*/ - && (pkt->cmd != MemCmd::Writeback)/*Writebacks on miss fall through*/) { - // Fetch the cache block to fill - BlkType *blk = tags->findBlock(pkt->getAddr()); - MemCmd temp_cmd = - coherence->getBusCmd(pkt->cmd, (blk) ? blk->status : 0); - - PacketPtr busPkt = new Packet(pkt->req,temp_cmd, -1, blkSize); - - busPkt->allocate(); + pkt->setSrc(origSrc); + } - busPkt->time = curTick; + if (!blk || !blk->isValid()) { + return; + } - DPRINTF(Cache, "Sending a atomic %s for %x\n", - busPkt->cmdString(), busPkt->getAddr()); + // we may end up modifying both the block state and the packet (if + // we respond in atomic mode), so just figure out what to do now + // and then do it later + bool respond = blk->isDirty() && pkt->needsResponse(); + bool have_exclusive = blk->isWritable(); + bool invalidate = pkt->isInvalidate(); + + if (pkt->isRead() && !pkt->isInvalidate()) { + assert(!pkt->needsExclusive()); + pkt->assertShared(); + int bits_to_clear = BlkWritable; + const bool haveOwnershipState = true; // for now + if (!haveOwnershipState) { + // if we don't support pure ownership (dirty && !writable), + // have to clear dirty bit here, assume memory snarfs data + // on cache-to-cache xfer + bits_to_clear |= BlkDirty; + } + blk->status &= ~bits_to_clear; + } - lat = memSidePort->sendAtomic(busPkt); + if (respond) { + assert(!pkt->memInhibitAsserted()); + pkt->assertMemInhibit(); + if (have_exclusive) { + pkt->setSupplyExclusive(); + } + if (is_timing) { + doTimingSupplyResponse(pkt, blk->data, is_deferred); + } else { + pkt->makeAtomicResponse(); + pkt->setDataFromBlock(blk->data, blkSize); + } + } - //Be sure to flip the response to a request for coherence - if (busPkt->needsResponse()) { - busPkt->makeAtomicResponse(); - } + // Do this last in case it deallocates block data or something + // like that + if (invalidate) { + tags->invalidateBlk(blk); + } -/* if (!(busPkt->flags & SATISFIED)) { -// blocked at a higher level, just return -return 0; + DPRINTF(Cache, "snooped a %s request for addr %x, %snew state is %i\n", + pkt->cmdString(), blockAlign(pkt->getAddr()), + respond ? "responding, " : "", blk->status); } -*/ misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; - CacheBlk::State old_state = (blk) ? 
blk->status : 0; - CacheBlk::State new_state = - coherence->getNewState(busPkt, old_state); - DPRINTF(Cache, "Receive response: %s for addr %x in state %i\n", - busPkt->cmdString(), busPkt->getAddr(), old_state); +template<class TagStore> +void +Cache<TagStore>::snoopTiming(PacketPtr pkt) +{ + // Note that some deferred snoops don't have requests, since the + // original access may have already completed + if ((pkt->req && pkt->req->isUncacheable()) || + pkt->cmd == MemCmd::Writeback) { + //Can't get a hit on an uncacheable address + //Revisit this for multi level coherence + return; + } - handleFill(blk, busPkt, new_state, writebacks, pkt); - if (old_state != new_state) - DPRINTF(Cache, "Block addr %x moving from state " - "%i to %i\n", busPkt->getAddr(), old_state, new_state); - //Free the packet - delete busPkt; + BlkType *blk = tags->findBlock(pkt->getAddr()); - // Handle writebacks if needed - while (!writebacks.empty()){ - PacketPtr wbPkt = writebacks.front(); - memSidePort->sendAtomic(wbPkt); - writebacks.pop_front(); - delete wbPkt; + Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1)); + MSHR *mshr = mshrQueue.findMatch(blk_addr); + + // Let the MSHR itself track the snoop and decide whether we want + // to go ahead and do the regular cache snoop + if (mshr && mshr->handleSnoop(pkt, order++)) { + DPRINTF(Cache, "Deferring snoop on in-service MSHR to blk %x\n", + blk_addr); + if (mshr->getNumTargets() > numTarget) + warn("allocating bonus target for snoop"); //handle later + return; + } + + //We also need to check the writeback buffers and handle those + std::vector<MSHR *> writebacks; + if (writeBuffer.findMatches(blk_addr, writebacks)) { + DPRINTF(Cache, "Snoop hit in writeback to addr: %x\n", + pkt->getAddr()); + + //Look through writebacks for any non-uncachable writes, use that + for (int i=0; i<writebacks.size(); i++) { + mshr = writebacks[i]; + assert(!mshr->isUncacheable()); + assert(mshr->getNumTargets() == 1); + PacketPtr wb_pkt = mshr->getTarget()->pkt; + assert(wb_pkt->cmd == MemCmd::Writeback); + + assert(!pkt->memInhibitAsserted()); + pkt->assertMemInhibit(); + if (!pkt->needsExclusive()) { + pkt->assertShared(); + } else { + // if we're not asserting the shared line, we need to + // invalidate our copy. we'll do that below as long as + // the packet's invalidate flag is set... + assert(pkt->isInvalidate()); } - return lat + hitLatency; - } else { - return memSidePort->sendAtomic(pkt); - } - } else { - if (blk) { - // There was a cache hit. - // Handle writebacks if needed - while (!writebacks.empty()){ - PacketPtr wbPkt = writebacks.front(); - memSidePort->sendAtomic(wbPkt); - writebacks.pop_front(); - delete wbPkt; + doTimingSupplyResponse(pkt, wb_pkt->getPtr<uint8_t>(), false); + + if (pkt->isInvalidate()) { + // Invalidation trumps our writeback... discard here + markInService(mshr); } - hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++; + // If this was a shared writeback, there may still be + // other shared copies above that require invalidation. + // We could be more selective and return here if the + // request is non-exclusive or if the writeback is + // exclusive. 
-template<class TagStore, class Coherence>
+
+template<class TagStore>
 Tick
-Cache<TagStore,Coherence>::snoopProbe(PacketPtr &pkt)
+Cache<TagStore>::snoopAtomic(PacketPtr pkt)
 {
-    //Send a atomic (false) invalidate up if the protocol calls for it
-    if (coherence->propogateInvalidate(pkt, false)) {
-        //Temp hack, we had a functional read hit in the L1, mark as success
-        pkt->flags |= SATISFIED;
-        pkt->result = Packet::Success;
+    if (pkt->req->isUncacheable() || pkt->cmd == MemCmd::Writeback) {
+        // Can't get a hit on an uncacheable address
+        // Revisit this for multi level coherence
         return hitLatency;
     }
-
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
     BlkType *blk = tags->findBlock(pkt->getAddr());
-    MSHR *mshr = missQueue->findMSHR(blk_addr);
-    CacheBlk::State new_state = 0;
-    bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
-    if (satisfy) {
-        DPRINTF(Cache, "Cache snooped a %s request for addr %x, "
-                "supplying data, new state is %i\n",
-                pkt->cmdString(), blk_addr, new_state);
-
-        handleSnoop(blk, new_state, pkt);
-        return hitLatency;
-    }
-    if (blk)
-        DPRINTF(Cache, "Cache snooped a %s request for addr %x, "
-                "new state is %i\n",
-                pkt->cmdString(), blk_addr, new_state);
-    handleSnoop(blk, new_state);
-    return 0;
+    handleSnoop(pkt, blk, false, false);
+    return hitLatency;
 }

-template<class TagStore, class Coherence>
-Port *
-Cache<TagStore,Coherence>::getPort(const std::string &if_name, int idx)
+
+template<class TagStore>
+MSHR *
+Cache<TagStore>::getNextMSHR()
 {
-    if (if_name == "" || if_name == "cpu_side")
-    {
-        if (cpuSidePort == NULL) {
-            cpuSidePort = new CpuSidePort(name() + "-cpu_side_port", this);
-            sendEvent = new ResponseEvent(cpuSidePort);
+    // Check both MSHR queue and write buffer for potential requests
+    MSHR *miss_mshr  = mshrQueue.getNextMSHR();
+    MSHR *write_mshr = writeBuffer.getNextMSHR();
+
+    // Now figure out which one to send... some cases are easy
+    if (miss_mshr && !write_mshr) {
+        return miss_mshr;
+    }
+    if (write_mshr && !miss_mshr) {
+        return write_mshr;
+    }
+
+    if (miss_mshr && write_mshr) {
+        // We have one of each... normally we favor the miss request
+        // unless the write buffer is full
+        if (writeBuffer.isFull() && writeBuffer.inServiceEntries == 0) {
+            // Write buffer is full, so we'd like to issue a write;
+            // need to search MSHR queue for conflicting earlier miss.
+            MSHR *conflict_mshr =
+                mshrQueue.findPending(write_mshr->addr, write_mshr->size);
+
+            if (conflict_mshr && conflict_mshr->order < write_mshr->order) {
+                // Service misses in order until conflict is cleared.
+                return conflict_mshr;
+            }
+
+            // No conflicts; issue write
+            return write_mshr;
         }
-        return cpuSidePort;
+
+        // Write buffer isn't full, but need to check it for
+        // conflicting earlier writeback
+        MSHR *conflict_mshr =
+            writeBuffer.findPending(miss_mshr->addr, miss_mshr->size);
+        if (conflict_mshr) {
+            // not sure why we don't check order here... it was in the
+            // original code but commented out.
+
+            // The only way this happens is if we are
+            // doing a write and we didn't have permissions
+            // then subsequently saw a writeback (owned got evicted)
+            // We need to make sure to perform the writeback first
+            // To preserve the dirty data, then we can issue the write
+
+            // should we return write_mshr here instead?  I.e. do we
+            // have to flush writes in order?  I don't think so... not
+            // for Alpha anyway.  Maybe for x86?
+            return conflict_mshr;
+        }
+
+        // No conflicts; issue read
+        return miss_mshr;
     }
-    else if (if_name == "functional")
-    {
-        return new CpuSidePort(name() + "-cpu_side_funcport", this);
+
+    // fall through... no pending requests.  Try a prefetch.
+    assert(!miss_mshr && !write_mshr);
+    if (!mshrQueue.isFull()) {
+        // If we have a miss queue slot, we can try a prefetch
+        PacketPtr pkt = prefetcher->getPacket();
+        if (pkt) {
+            // Update statistic on number of prefetches issued
+            // (hwpf_mshr_misses)
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
+            // Don't request bus, since we already have it
+            return allocateMissBuffer(pkt, curTick, false);
+        }
     }
-    else if (if_name == "mem_side")
-    {
-        if (memSidePort != NULL)
-            panic("Already have a mem side for this cache\n");
-        memSidePort = new MemSidePort(name() + "-mem_side_port", this);
-        memSendEvent = new ResponseEvent(memSidePort);
-        return memSidePort;
+
+    return NULL;
+}
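Note: the arbitration in getNextMSHR() above favors demand misses, switches to draining the write buffer only when it is full and has nothing in service, and always lets an older conflicting entry for the same block go first. Below is a compact standalone sketch of that priority scheme, leaving out the prefetch fall-through; Entry, PickInputs and pickNext() are made-up names for illustration, not m5 types.

#include <cassert>
#include <cstddef>

// Minimal stand-ins for queue entries; only what the policy needs.
struct Entry { int order; };        // lower order == older request

struct PickInputs {
    Entry *miss;                // oldest ready demand miss, or NULL
    Entry *write;               // oldest ready writeback, or NULL
    bool   writeBufFull;        // write buffer has no free slots
    int    writeBufInService;   // writebacks already issued to the bus
    Entry *missConflictingWithWrite;  // older miss to the write's block
    Entry *writeConflictingWithMiss;  // older writeback to the miss's block
};

// Mirrors the priority scheme in getNextMSHR(): favor misses, drain the
// write buffer when it is full and idle, and always let a conflicting
// older entry go first.
Entry *pickNext(const PickInputs &in)
{
    if (in.miss && !in.write) return in.miss;
    if (in.write && !in.miss) return in.write;
    if (!in.miss && !in.write) return NULL;

    if (in.writeBufFull && in.writeBufInService == 0) {
        if (in.missConflictingWithWrite &&
            in.missConflictingWithWrite->order < in.write->order)
            return in.missConflictingWithWrite;  // clear the older miss first
        return in.write;                         // then issue the write
    }
    if (in.writeConflictingWithMiss)
        return in.writeConflictingWithMiss;      // writeback preserves dirty data
    return in.miss;
}

int main()
{
    Entry miss = { 7 }, write = { 5 };
    PickInputs in = { &miss, &write, true, 0, NULL, NULL };
    assert(pickNext(in) == &write);   // full, idle write buffer drains first
    in.writeBufFull = false;
    assert(pickNext(in) == &miss);    // otherwise the demand miss wins
    return 0;
}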
+
+
+template<class TagStore>
+PacketPtr
+Cache<TagStore>::getTimingPacket()
+{
+    MSHR *mshr = getNextMSHR();
+
+    if (mshr == NULL) {
+        return NULL;
+    }
+
+    // use request from 1st target
+    PacketPtr tgt_pkt = mshr->getTarget()->pkt;
+    PacketPtr pkt = NULL;
+
+    if (mshr->isSimpleForward()) {
+        // no response expected, just forward packet as it is
+        assert(tags->findBlock(mshr->addr) == NULL);
+        pkt = tgt_pkt;
+    } else {
+        BlkType *blk = tags->findBlock(mshr->addr);
+        pkt = getBusPacket(tgt_pkt, blk, mshr->needsExclusive());
+
+        mshr->isCacheFill = (pkt != NULL);
+
+        if (pkt == NULL) {
+            // not a cache block request, but a response is expected
+            assert(!mshr->isSimpleForward());
+            // make copy of current packet to forward, keep current
+            // copy for response handling
+            pkt = new Packet(tgt_pkt);
+            pkt->allocate();
+            if (pkt->isWrite()) {
+                pkt->setData(tgt_pkt->getPtr<uint8_t>());
+            }
+        }
     }
-    else panic("Port name %s unrecognized\n", if_name);
+
+    assert(pkt != NULL);
+    pkt->senderState = mshr;
+    return pkt;
 }
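Note: getTimingPacket() above ends up sending one of three things: the target packet itself (simple forward), a block-sized bus packet built by getBusPacket() (a cache fill, which also sets mshr->isCacheFill), or a fresh copy of the original request when a response is expected but no block fetch is needed. A rough sketch of just that three-way decision; BusPacketKind and classify() are illustrative only, and 'blockRequest' stands in for getBusPacket() returning a non-NULL packet.

#include <cassert>

// Illustrative classification of what getTimingPacket() ends up
// sending; the real code builds actual Packet objects instead.
enum BusPacketKind {
    ForwardOriginal,   // simple forward: no response will come back
    CacheFillRequest,  // normal miss: fetch the whole block
    CopyOfRequest      // response expected, but not a block fetch
};

BusPacketKind classify(bool simpleForward, bool blockRequest)
{
    if (simpleForward)
        return ForwardOriginal;
    if (blockRequest)
        return CacheFillRequest;   // this is the mshr->isCacheFill case
    return CopyOfRequest;          // forward a copy, keep the original
}

int main()
{
    assert(classify(true,  false) == ForwardOriginal);
    assert(classify(false, true)  == CacheFillRequest);
    assert(classify(false, false) == CopyOfRequest);
    return 0;
}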
-template<class TagStore, class Coherence>
+
+
+///////////////
+//
+// CpuSidePort
+//
+///////////////
+
+template<class TagStore>
 void
-Cache<TagStore,Coherence>::deletePortRefs(Port *p)
+Cache<TagStore>::CpuSidePort::
+getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
 {
-    if (cpuSidePort == p || memSidePort == p)
-        panic("Can only delete functional ports\n");
-
-    delete p;
+    // CPU side port doesn't snoop; it's a target only.
+    bool dummy;
+    otherPort->getPeerAddressRanges(resp, dummy);
+    snoop = false;
 }

-template<class TagStore, class Coherence>
+
+template<class TagStore>
 bool
-Cache<TagStore,Coherence>::CpuSidePort::recvTiming(PacketPtr pkt)
+Cache<TagStore>::CpuSidePort::recvTiming(PacketPtr pkt)
 {
-    assert(pkt->result != Packet::Nacked);
-
-    if (!pkt->req->isUncacheable()
-        && pkt->isInvalidate()
-        && !pkt->isRead() && !pkt->isWrite()) {
-        //Upgrade or Invalidate
-        //Look into what happens if two slave caches on bus
-        DPRINTF(Cache, "%s %x ?\n", pkt->cmdString(), pkt->getAddr());
-
-        assert(!(pkt->flags & SATISFIED));
-        pkt->flags |= SATISFIED;
-        //Invalidates/Upgrades need no response if they get the bus
-        return true;
-    }
-
-    if (pkt->isRequest() && blocked)
-    {
+    // illegal to block responses... can lead to deadlock
+    if (pkt->isRequest() && !pkt->memInhibitAsserted() && blocked) {
         DPRINTF(Cache,"Scheduling a retry while blocked\n");
         mustSendRetry = true;
         return false;
     }

-    if (pkt->isWrite() && (pkt->req->isLocked())) {
-        pkt->req->setExtraData(1);
-    }
-    myCache()->access(pkt);
+    myCache()->timingAccess(pkt);
     return true;
 }

-template<class TagStore, class Coherence>
+
+template<class TagStore>
 Tick
-Cache<TagStore,Coherence>::CpuSidePort::recvAtomic(PacketPtr pkt)
+Cache<TagStore>::CpuSidePort::recvAtomic(PacketPtr pkt)
 {
-    myCache()->probe(pkt, true, NULL);
-    //TEMP ALWAYS SUCCES FOR NOW
-    pkt->result = Packet::Success;
-    //Fix this timing info
-    return myCache()->hitLatency;
+    return myCache()->atomicAccess(pkt);
 }

-template<class TagStore, class Coherence>
+
+template<class TagStore>
 void
-Cache<TagStore,Coherence>::CpuSidePort::recvFunctional(PacketPtr pkt)
+Cache<TagStore>::CpuSidePort::recvFunctional(PacketPtr pkt)
 {
-    if (checkFunctional(pkt)) {
-        //TEMP USE CPU?THREAD 0 0
-        pkt->req->setThreadContext(0,0);
-
-        myCache()->probe(pkt, false, cache->memSidePort);
-        //TEMP ALWAYS SUCCESFUL FOR NOW
-        pkt->result = Packet::Success;
+    if (!checkFunctional(pkt)) {
+        myCache()->functionalAccess(pkt, cache->memSidePort);
     }
 }

-template<class TagStore, class Coherence>
+template<class TagStore>
+Cache<TagStore>::
+CpuSidePort::CpuSidePort(const std::string &_name,
+                         Cache<TagStore> *_cache)
+    : BaseCache::CachePort(_name, _cache)
+{
+}
+
+///////////////
+//
+// MemSidePort
+//
+///////////////
+
+template<class TagStore>
+void
+Cache<TagStore>::MemSidePort::
+getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+{
+    otherPort->getPeerAddressRanges(resp, snoop);
+    // Memory-side port always snoops, so unconditionally set flag for
+    // caller.
+    snoop = true;
+}
+
+
+template<class TagStore>
 bool
-Cache<TagStore,Coherence>::MemSidePort::recvTiming(PacketPtr pkt)
+Cache<TagStore>::MemSidePort::recvTiming(PacketPtr pkt)
 {
     // this needs to be fixed so that the cache updates the mshr and sends the
     // packet back out on the link, but it probably won't happen so until this
     // gets fixed, just panic when it does
-    if (pkt->result == Packet::Nacked)
+    if (pkt->wasNacked())
         panic("Need to implement cache resending nacked packets!\n");

-    if (pkt->isRequest() && blocked)
-    {
+    if (pkt->isRequest() && blocked) {
         DPRINTF(Cache,"Scheduling a retry while blocked\n");
         mustSendRetry = true;
         return false;
     }

-    if (pkt->isResponse())
+    if (pkt->isResponse()) {
         myCache()->handleResponse(pkt);
-    else {
-        //Check if we should do the snoop
-        if (pkt->flags & SNOOP_COMMIT)
-            myCache()->snoop(pkt);
+    } else {
+        myCache()->snoopTiming(pkt);
     }
     return true;
 }
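Note: both recvTiming() implementations above share the same flow-control rule: a request that arrives while the cache is blocked is refused and a retry is remembered, while responses (and, on the CPU side, requests with the mem-inhibit signal already asserted) must always be accepted to avoid deadlock. A toy model of that rule follows; PortState, PacketInfo and acceptPacket() are illustrative stand-ins, not m5 classes.

#include <cassert>

struct PacketInfo {
    bool isRequest;
    bool memInhibitAsserted;   // another cache already claimed this request
};

struct PortState {
    bool blocked;
    bool mustSendRetry;
};

// Returns true if the packet is handed to the cache, false if it is
// refused and the port will send a retry once it unblocks.
bool acceptPacket(PortState &port, const PacketInfo &pkt)
{
    if (pkt.isRequest && !pkt.memInhibitAsserted && port.blocked) {
        port.mustSendRetry = true;   // nack now, retry later
        return false;
    }
    return true;                     // responses are never refused
}

int main()
{
    PortState port = { true, false };
    PacketInfo req = { true, false };
    PacketInfo resp = { false, false };
    assert(!acceptPacket(port, req) && port.mustSendRetry);
    assert(acceptPacket(port, resp));   // responses always get through
    return 0;
}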
-template<class TagStore, class Coherence>
+
+template<class TagStore>
 Tick
-Cache<TagStore,Coherence>::MemSidePort::recvAtomic(PacketPtr pkt)
+Cache<TagStore>::MemSidePort::recvAtomic(PacketPtr pkt)
 {
-    if (pkt->isResponse())
-        myCache()->handleResponse(pkt);
-    else
-        return myCache()->snoopProbe(pkt);
-    //Fix this timing info
-    return myCache()->hitLatency;
+    // in atomic mode, responses go back to the sender via the
+    // function return from sendAtomic(), not via a separate
+    // sendAtomic() from the responder.  Thus we should never see a
+    // response packet in recvAtomic() (anywhere, not just here).
+    assert(!pkt->isResponse());
+    return myCache()->snoopAtomic(pkt);
 }

-template<class TagStore, class Coherence>
+
+template<class TagStore>
 void
-Cache<TagStore,Coherence>::MemSidePort::recvFunctional(PacketPtr pkt)
+Cache<TagStore>::MemSidePort::recvFunctional(PacketPtr pkt)
 {
-    myCache()->probe(pkt, false, cache->cpuSidePort);
-    if (pkt->result != Packet::Success)
-        checkFunctional(pkt);
+    if (!checkFunctional(pkt)) {
+        myCache()->functionalAccess(pkt, cache->cpuSidePort);
+    }
 }

-template<class TagStore, class Coherence>
-Cache<TagStore,Coherence>::
-CpuSidePort::CpuSidePort(const std::string &_name,
-                         Cache<TagStore,Coherence> *_cache)
-    : BaseCache::CachePort(_name, _cache, true)
+
+template<class TagStore>
+void
+Cache<TagStore>::MemSidePort::sendPacket()
 {
+    // if we have responses that are ready, they take precedence
+    if (deferredPacketReady()) {
+        bool success = sendTiming(transmitList.front().pkt);
+
+        if (success) {
+            //send successful, remove packet
+            transmitList.pop_front();
+        }
+
+        waitingOnRetry = !success;
+    } else {
+        // check for non-response packets (requests & writebacks)
+        PacketPtr pkt = myCache()->getTimingPacket();
+        if (pkt == NULL) {
+            // can happen if e.g. we attempt a writeback and fail, but
+            // before the retry, the writeback is eliminated because
+            // we snoop another cache's ReadEx.
+            waitingOnRetry = false;
+        } else {
+            MSHR *mshr = dynamic_cast<MSHR*>(pkt->senderState);
+
+            bool success = sendTiming(pkt);
+            DPRINTF(CachePort,
+                    "Address %x was %s in sending the timing request\n",
+                    pkt->getAddr(), success ? "successful" : "unsuccessful");
+
+            waitingOnRetry = !success;
+            if (waitingOnRetry) {
+                DPRINTF(CachePort, "now waiting on a retry\n");
+                if (!mshr->isSimpleForward()) {
+                    delete pkt;
+                }
+            } else {
+                myCache()->markInService(mshr);
+            }
+        }
+    }
+
+
+    // tried to send packet... if it was successful (no retry), see if
+    // we need to rerequest bus or not
+    if (!waitingOnRetry) {
+        Tick nextReady = std::min(deferredPacketReadyTime(),
+                                  myCache()->nextMSHRReadyTime());
+        // @TODO: need to facotr in prefetch requests here somehow
+        if (nextReady != MaxTick) {
+            DPRINTF(CachePort, "more packets to send @ %d\n", nextReady);
+            sendEvent->schedule(std::max(nextReady, curTick + 1));
+        } else {
+            // no more to send right now: if we're draining, we may be done
+            if (drainEvent) {
+                drainEvent->process();
+                drainEvent = NULL;
+            }
+        }
+    }
+}
+
+template<class TagStore>
+void
+Cache<TagStore>::MemSidePort::recvRetry()
+{
+    assert(waitingOnRetry);
+    sendPacket();
 }

-template<class TagStore, class Coherence>
-Cache<TagStore,Coherence>::
-MemSidePort::MemSidePort(const std::string &_name,
-                         Cache<TagStore,Coherence> *_cache)
-    : BaseCache::CachePort(_name, _cache, false)
+
+template<class TagStore>
+void
+Cache<TagStore>::MemSidePort::processSendEvent()
 {
+    assert(!waitingOnRetry);
+    sendPacket();
 }
+
+template<class TagStore>
+Cache<TagStore>::
+MemSidePort::MemSidePort(const std::string &_name, Cache<TagStore> *_cache)
+    : BaseCache::CachePort(_name, _cache)
+{
+    // override default send event from SimpleTimingPort
+    delete sendEvent;
+    sendEvent = new SendEvent(this);
+}
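Note: the rescheduling at the end of MemSidePort::sendPacket() above picks the earlier of the next deferred response and the next ready MSHR, clamps it to no earlier than the next tick, and treats MaxTick as "nothing left to send". A minimal sketch of that computation; the Tick typedef, MaxTick constant and nextSendTime() helper below are local to the example, not taken from the m5 headers.

#include <algorithm>
#include <cassert>
#include <stdint.h>

typedef uint64_t Tick;
static const Tick MaxTick = ~Tick(0);   // stand-in for m5's MaxTick

// After a successful send, the port reschedules itself for the earlier
// of the next deferred response and the next ready MSHR, but never
// earlier than the next tick. MaxTick means the port can go idle.
Tick nextSendTime(Tick deferredReady, Tick mshrReady, Tick curTick)
{
    Tick nextReady = std::min(deferredReady, mshrReady);
    if (nextReady == MaxTick)
        return MaxTick;                      // idle: nothing to schedule
    return std::max(nextReady, curTick + 1); // can't schedule in the past
}

int main()
{
    assert(nextSendTime(MaxTick, MaxTick, 100) == MaxTick);
    assert(nextSendTime(50, MaxTick, 100) == 101);  // overdue -> next tick
    assert(nextSendTime(MaxTick, 250, 100) == 250);
    return 0;
}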