20 files changed, 896 insertions, 279 deletions
diff --git a/src/mem/bus.cc b/src/mem/bus.cc
index cf9e54e62..75ffed0d2 100644
--- a/src/mem/bus.cc
+++ b/src/mem/bus.cc
@@ -61,12 +61,79 @@ Bus::getPort(const std::string &if_name, int idx)
 void
 Bus::init()
 {
-    std::vector<Port*>::iterator intIter;
+    std::vector<BusPort*>::iterator intIter;
 
     for (intIter = interfaces.begin(); intIter != interfaces.end(); intIter++)
         (*intIter)->sendStatusChange(Port::RangeChange);
 }
 
+Bus::BusFreeEvent::BusFreeEvent(Bus *_bus) : Event(&mainEventQueue), bus(_bus)
+{}
+
+void Bus::BusFreeEvent::process()
+{
+    bus->recvRetry(-1);
+}
+
+const char * Bus::BusFreeEvent::description()
+{
+    return "bus became available";
+}
+
+void Bus::occupyBus(PacketPtr pkt)
+{
+    //Bring tickNextIdle up to the present tick
+    //There is some potential ambiguity where a cycle starts, which might make
+    //a difference when devices are acting right around a cycle boundary. Using
+    //a < allows things which happen exactly on a cycle boundary to take up only
+    //the following cycle. Anthing that happens later will have to "wait" for
+    //the end of that cycle, and then start using the bus after that.
+    while (tickNextIdle < curTick)
+        tickNextIdle += clock;
+
+    // The packet will be sent. Figure out how long it occupies the bus, and
+    // how much of that time is for the first "word", aka bus width.
+    int numCycles = 0;
+    // Requests need one cycle to send an address
+    if (pkt->isRequest())
+        numCycles++;
+    else if (pkt->isResponse() || pkt->hasData()) {
+        // If a packet has data, it needs ceil(size/width) cycles to send it
+        // We're using the "adding instead of dividing" trick again here
+        if (pkt->hasData()) {
+            int dataSize = pkt->getSize();
+            for (int transmitted = 0; transmitted < dataSize;
+                    transmitted += width) {
+                numCycles++;
+            }
+        } else {
+            // If the packet didn't have data, it must have been a response.
+            // Those use the bus for one cycle to send their data.
+            numCycles++;
+        }
+    }
+
+    // The first word will be delivered after the current tick, the delivery
+    // of the address if any, and one bus cycle to deliver the data
+    pkt->firstWordTime =
+        tickNextIdle +
+        pkt->isRequest() ? clock : 0 +
+        clock;
+
+    //Advance it numCycles bus cycles.
+    //XXX Should this use the repeated addition trick as well?
+    tickNextIdle += (numCycles * clock);
+    if (!busIdle.scheduled()) {
+        busIdle.schedule(tickNextIdle);
+    } else {
+        busIdle.reschedule(tickNextIdle);
+    }
+    DPRINTF(Bus, "The bus is now occupied from tick %d to %d\n",
+            curTick, tickNextIdle);
+
+    // The bus will become idle once the current packet is delivered.
+    pkt->finishTime = tickNextIdle;
+}
 
 /** Function called by the port when the bus is receiving a Timing
  * transaction.*/
@@ -77,17 +144,40 @@ Bus::recvTiming(Packet *pkt)
     DPRINTF(Bus, "recvTiming: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
 
+    BusPort *pktPort;
+    if (pkt->getSrc() == defaultId)
+        pktPort = defaultPort;
+    else pktPort = interfaces[pkt->getSrc()];
+
+    // If the bus is busy, or other devices are in line ahead of the current
+    // one, put this device on the retry list.
+    if (tickNextIdle > curTick ||
+            (retryList.size() && (!inRetry || pktPort != retryList.front()))) {
+        addToRetryList(pktPort);
+        return false;
+    }
+
     short dest = pkt->getDest();
     if (dest == Packet::Broadcast) {
-        if ( timingSnoopPhase1(pkt) )
-        {
-            timingSnoopPhase2(pkt);
+        if (timingSnoop(pkt)) {
+            pkt->flags |= SNOOP_COMMIT;
+            bool success = timingSnoop(pkt);
+            assert(success);
+            if (pkt->flags & SATISFIED) {
+                //Cache-Cache transfer occuring
+                if (inRetry) {
+                    retryList.front()->onRetryList(false);
+                    retryList.pop_front();
+                    inRetry = false;
+                }
+                occupyBus(pkt);
+                return true;
+            }
             port = findPort(pkt->getAddr(), pkt->getSrc());
-        }
-        else
-        {
+        } else {
             //Snoop didn't succeed
-            retryList.push_back(interfaces[pkt->getSrc()]);
+            DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+            addToRetryList(pktPort);
             return false;
         }
     } else {
@@ -95,35 +185,60 @@ Bus::recvTiming(Packet *pkt)
         assert(dest != pkt->getSrc()); // catch infinite loops
         port = interfaces[dest];
     }
+
+    occupyBus(pkt);
+
     if (port->sendTiming(pkt))  {
-        // packet was successfully sent, just return true.
+        // Packet was successfully sent. Return true.
+        // Also take care of retries
+        if (inRetry) {
+            DPRINTF(Bus, "Remove retry from list %i\n", retryList.front());
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;
+        }
         return true;
     }
 
-    // packet not successfully sent
-    retryList.push_back(interfaces[pkt->getSrc()]);
+    // Packet not successfully sent. Leave or put it on the retry list.
+    DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+    addToRetryList(pktPort);
     return false;
 }
 
 void
 Bus::recvRetry(int id)
 {
-    // Go through all the elements on the list calling sendRetry on each
-    // This is not very efficient at all but it works. Ultimately we should end
-    // up with something that is more intelligent.
-    int initialSize = retryList.size();
-    int i;
-    Port *p;
-
-    for (i = 0; i < initialSize; i++) {
-        assert(retryList.size() > 0);
-        p = retryList.front();
-        retryList.pop_front();
-        p->sendRetry();
+    DPRINTF(Bus, "Received a retry\n");
+    // If there's anything waiting, and the bus isn't busy...
+    if (retryList.size() && curTick >= tickNextIdle) {
+        //retryingPort = retryList.front();
+        inRetry = true;
+        DPRINTF(Bus, "Sending a retry\n");
+        retryList.front()->sendRetry();
+        // If inRetry is still true, sendTiming wasn't called
+        if (inRetry)
+        {
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;
+
+            //Bring tickNextIdle up to the present
+            while (tickNextIdle < curTick)
+                tickNextIdle += clock;
+
+            //Burn a cycle for the missed grant.
+            tickNextIdle += clock;
+
+            if (!busIdle.scheduled()) {
+                busIdle.schedule(tickNextIdle);
+            } else {
+                busIdle.reschedule(tickNextIdle);
+            }
+        }
     }
 }
 
-
 Port *
 Bus::findPort(Addr addr, int id)
 {
@@ -174,64 +289,60 @@ Bus::findSnoopPorts(Addr addr, int id)
             //Careful  to not overlap ranges
             //or snoop will be called more than once on the port
             ports.push_back(portSnoopList[i].portId);
-            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
-                    portSnoopList[i].portId);
+//            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
+//                    portSnoopList[i].portId);
         }
         i++;
     }
     return ports;
 }
 
-void
+Tick
 Bus::atomicSnoop(Packet *pkt)
 {
     std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
+    Tick response_time = 0;
 
     while (!ports.empty())
     {
-        interfaces[ports.back()]->sendAtomic(pkt);
+        Tick response = interfaces[ports.back()]->sendAtomic(pkt);
+        if (response) {
+            assert(!response_time);  //Multiple responders
+            response_time = response;
+        }
         ports.pop_back();
     }
+    return response_time;
 }
 
-bool
-Bus::timingSnoopPhase1(Packet *pkt)
+void
+Bus::functionalSnoop(Packet *pkt)
 {
     std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
-    bool success = true;
 
-    while (!ports.empty() && success)
+    while (!ports.empty())
     {
-        snoopCallbacks.push_back(ports.back());
-        success = interfaces[ports.back()]->sendTiming(pkt);
+        interfaces[ports.back()]->sendFunctional(pkt);
         ports.pop_back();
     }
-    if (!success)
-    {
-        while (!snoopCallbacks.empty())
-        {
-            interfaces[snoopCallbacks.back()]->sendStatusChange(Port::SnoopSquash);
-            snoopCallbacks.pop_back();
-        }
-        return false;
-    }
-    return true;
 }
 
-void
-Bus::timingSnoopPhase2(Packet *pkt)
+bool
+Bus::timingSnoop(Packet *pkt)
 {
-    bool success;
-    pkt->flags |= SNOOP_COMMIT;
-    while (!snoopCallbacks.empty())
+    std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
+    bool success = true;
+
+    while (!ports.empty() && success)
     {
-        success = interfaces[snoopCallbacks.back()]->sendTiming(pkt);
-        //We should not fail on snoop callbacks
-        assert(success);
-        snoopCallbacks.pop_back();
+        success = interfaces[ports.back()]->sendTiming(pkt);
+        ports.pop_back();
     }
+
+    return success;
 }
 
+
 /** Function called by the port when the bus is receiving a Atomic
  * transaction.*/
 Tick
@@ -240,8 +351,11 @@ Bus::recvAtomic(Packet *pkt)
     DPRINTF(Bus, "recvAtomic: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
     assert(pkt->getDest() == Packet::Broadcast);
-    atomicSnoop(pkt);
-    return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
+    Tick snoopTime = atomicSnoop(pkt);
+    if (snoopTime)
+        return snoopTime;  //Snoop satisfies it
+    else
+        return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
 }
 
 /** Function called by the port when the bus is receiving a Functional
@@ -252,6 +366,7 @@ Bus::recvFunctional(Packet *pkt)
     DPRINTF(Bus, "recvFunctional: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
     assert(pkt->getDest() == Packet::Broadcast);
+    functionalSnoop(pkt);
     findPort(pkt->getAddr(), pkt->getSrc())->sendFunctional(pkt);
 }
 
@@ -280,7 +395,7 @@ Bus::recvStatusChange(Port::Status status, int id)
         }
     } else {
 
-        assert((id < interfaces.size() && id >= 0) || id == -1);
+        assert((id < interfaces.size() && id >= 0) || id == defaultId);
         Port *port = interfaces[id];
         std::vector<DevMap>::iterator portIter;
         std::vector<DevMap>::iterator snoopIter;
@@ -380,16 +495,20 @@ Bus::addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id)
 BEGIN_DECLARE_SIM_OBJECT_PARAMS(Bus)
 
     Param<int> bus_id;
+    Param<int> clock;
+    Param<int> width;
 
 END_DECLARE_SIM_OBJECT_PARAMS(Bus)
 
 BEGIN_INIT_SIM_OBJECT_PARAMS(Bus)
-    INIT_PARAM(bus_id, "a globally unique bus id")
+    INIT_PARAM(bus_id, "a globally unique bus id"),
+    INIT_PARAM(clock, "bus clock speed"),
+    INIT_PARAM(width, "width of the bus (bits)")
 END_INIT_SIM_OBJECT_PARAMS(Bus)
 
 CREATE_SIM_OBJECT(Bus)
 {
-    return new Bus(getInstanceName(), bus_id);
+    return new Bus(getInstanceName(), bus_id, clock, width);
 }
 
 REGISTER_SIM_OBJECT("Bus", Bus)
diff --git a/src/mem/bus.hh b/src/mem/bus.hh
index 941389296..509b8cf9b 100644
--- a/src/mem/bus.hh
+++ b/src/mem/bus.hh
@@ -46,13 +46,20 @@
 #include "mem/packet.hh"
 #include "mem/port.hh"
 #include "mem/request.hh"
+#include "sim/eventq.hh"
 
 class Bus : public MemObject
 {
     /** a globally unique id for this bus. */
     int busId;
+    /** the clock speed for the bus */
+    int clock;
+    /** the width of the bus in bytes */
+    int width;
+    /** the next tick at which the bus will be idle */
+    Tick tickNextIdle;
 
-    static const int defaultId = -1;
+    static const int defaultId = -3; //Make it unique from Broadcast
 
     struct DevMap {
         int portId;
@@ -62,9 +69,6 @@ class Bus : public MemObject
     AddrRangeList defaultRange;
     std::vector<DevMap> portSnoopList;
 
-    std::vector<int> snoopCallbacks;
-
-
     /** Function called by the port when the bus is recieving a Timing
       transaction.*/
     bool recvTiming(Packet *pkt);
@@ -103,18 +107,16 @@ class Bus : public MemObject
     std::vector<int> findSnoopPorts(Addr addr, int id);
 
     /** Snoop all relevant ports atomicly. */
-    void atomicSnoop(Packet *pkt);
+    Tick atomicSnoop(Packet *pkt);
 
-    /** Snoop for NACK and Blocked in phase 1
-     * @return True if succeds.
-     */
-    bool timingSnoopPhase1(Packet *pkt);
+    /** Snoop all relevant ports functionally. */
+    void functionalSnoop(Packet *pkt);
 
-    /** @todo Don't need to commit all snoops just those that need it
-     *(register somehow). */
-    /** Commit all snoops now that we know if any of them would have blocked.
+    /** Call snoop on caches, be sure to set SNOOP_COMMIT bit if you want
+     * the snoop to happen
+     * @return True if succeds.
      */
-    void timingSnoopPhase2(Packet *pkt);
+    bool timingSnoop(Packet *pkt);
 
     /** Process address range request.
      * @param resp addresses that we can respond to
@@ -123,11 +125,15 @@ class Bus : public MemObject
      */
     void addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id);
 
+    /** Occupy the bus with transmitting the packet pkt */
+    void occupyBus(PacketPtr pkt);
 
     /** Declaration of the buses port type, one will be instantiated for each
         of the interfaces connecting to the bus. */
     class BusPort : public Port
     {
+        bool _onRetryList;
+
         /** A pointer to the bus to which this port belongs. */
         Bus *bus;
 
@@ -138,9 +144,15 @@ class Bus : public MemObject
 
         /** Constructor for the BusPort.*/
         BusPort(const std::string &_name, Bus *_bus, int _id)
-            : Port(_name), bus(_bus), id(_id)
+            : Port(_name), _onRetryList(false), bus(_bus), id(_id)
         { }
 
+        bool onRetryList()
+        { return _onRetryList; }
+
+        void onRetryList(bool newVal)
+        { _onRetryList = newVal; }
+
       protected:
 
         /** When reciving a timing request from the peer port (at id),
@@ -181,16 +193,52 @@ class Bus : public MemObject
 
     };
 
+    class BusFreeEvent : public Event
+    {
+        Bus * bus;
+
+      public:
+        BusFreeEvent(Bus * _bus);
+        void process();
+        const char *description();
+    };
+
+    BusFreeEvent busIdle;
+
+    bool inRetry;
+
     /** An array of pointers to the peer port interfaces
         connected to this bus.*/
-    std::vector<Port*> interfaces;
+    std::vector<BusPort*> interfaces;
 
     /** An array of pointers to ports that retry should be called on because the
      * original send failed for whatever reason.*/
-    std::list<Port*> retryList;
+    std::list<BusPort*> retryList;
+
+    void addToRetryList(BusPort * port)
+    {
+        if (!inRetry) {
+            // The device wasn't retrying a packet, or wasn't at an appropriate
+            // time.
+            assert(!port->onRetryList());
+            port->onRetryList(true);
+            retryList.push_back(port);
+        } else {
+            if (port->onRetryList()) {
+                // The device was retrying a packet. It didn't work, so we'll leave
+                // it at the head of the retry list.
+                assert(port == retryList.front());
+                inRetry = false;
+            }
+            else {
+                port->onRetryList(true);
+                retryList.push_back(port);
+            }
+        }
+    }
 
     /** Port that handles requests that don't match any of the interfaces.*/
-    Port *defaultPort;
+    BusPort *defaultPort;
 
   public:
 
@@ -199,8 +247,16 @@ class Bus : public MemObject
 
     virtual void init();
 
-    Bus(const std::string &n, int bus_id)
-        : MemObject(n), busId(bus_id), defaultPort(NULL)  {}
+    Bus(const std::string &n, int bus_id, int _clock, int _width)
+        : MemObject(n), busId(bus_id), clock(_clock), width(_width),
+        tickNextIdle(0), busIdle(this), inRetry(false), defaultPort(NULL)
+    {
+        //Both the width and clock period must be positive
+        if (width <= 0)
+            fatal("Bus width must be positive\n");
+        if (clock <= 0)
+            fatal("Bus clock period must be positive\n");
+    }
 
 };
 
diff --git a/src/mem/cache/base_cache.cc b/src/mem/cache/base_cache.cc
index a172847df..71ea58416 100644
--- a/src/mem/cache/base_cache.cc
+++ b/src/mem/cache/base_cache.cc
@@ -44,6 +44,8 @@ BaseCache::CachePort::CachePort(const std::string &_name, BaseCache *_cache,
     : Port(_name), cache(_cache), isCpuSide(_isCpuSide)
 {
     blocked = false;
+    cshrRetry = NULL;
+    waitingOnRetry = false;
     //Start ports at null if more than one is created we should panic
     //cpuSidePort = NULL;
     //memSidePort = NULL;
@@ -71,7 +73,23 @@ BaseCache::CachePort::deviceBlockSize()
 bool
 BaseCache::CachePort::recvTiming(Packet *pkt)
 {
-    if (blocked)
+    if (isCpuSide
+        && !pkt->req->isUncacheable()
+        && pkt->isInvalidate()
+        && !pkt->isRead() && !pkt->isWrite()) {
+        //Upgrade or Invalidate
+        //Look into what happens if two slave caches on bus
+        DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
+                pkt->getAddr() & (((ULL(1))<<48)-1),
+                pkt->getAddr() & ~((Addr)cache->blkSize - 1));
+
+        assert(!(pkt->flags & SATISFIED));
+        pkt->flags |= SATISFIED;
+        //Invalidates/Upgrades need no response if they get the bus
+        return true;
+    }
+
+    if (pkt->isRequest() && blocked)
     {
         DPRINTF(Cache,"Scheduling a retry while blocked\n");
         mustSendRetry = true;
@@ -96,16 +114,44 @@ void
 BaseCache::CachePort::recvRetry()
 {
     Packet *pkt;
-
-    if (!isCpuSide)
+    assert(waitingOnRetry);
+    if (!drainList.empty()) {
+        DPRINTF(CachePort, "%s attempting to send a retry for response\n", name());
+        //We have some responses to drain first
+        if (sendTiming(drainList.front())) {
+            DPRINTF(CachePort, "%s sucessful in sending a retry for response\n", name());
+            drainList.pop_front();
+            if (!drainList.empty() ||
+                !isCpuSide && cache->doMasterRequest() ||
+                isCpuSide && cache->doSlaveRequest()) {
+
+                DPRINTF(CachePort, "%s has more responses/requests\n", name());
+                BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+                reqCpu->schedule(curTick + 1);
+            }
+            waitingOnRetry = false;
+        }
+    }
+    else if (!isCpuSide)
     {
+        DPRINTF(CachePort, "%s attempting to send a retry for MSHR\n", name());
+        if (!cache->doMasterRequest()) {
+            //This can happen if I am the owner of a block and see an upgrade
+            //while the block was in my WB Buffers.  I just remove the
+            //wb and de-assert the masterRequest
+            waitingOnRetry = false;
+            return;
+        }
         pkt = cache->getPacket();
+        MSHR* mshr = (MSHR*)pkt->senderState;
         bool success = sendTiming(pkt);
         DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                 pkt->getAddr(), success ? "succesful" : "unsuccesful");
-        cache->sendResult(pkt, success);
+        cache->sendResult(pkt, mshr, success);
+        waitingOnRetry = !success;
         if (success && cache->doMasterRequest())
         {
+            DPRINTF(CachePort, "%s has more requests\n", name());
             //Still more to issue, rerequest in 1 cycle
             pkt = NULL;
             BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
@@ -114,17 +160,23 @@ BaseCache::CachePort::recvRetry()
     }
     else
     {
-        pkt = cache->getCoherencePacket();
+        assert(cshrRetry);
+        //pkt = cache->getCoherencePacket();
+        //We save the packet, no reordering on CSHRS
+        pkt = cshrRetry;
         bool success = sendTiming(pkt);
+        waitingOnRetry = !success;
         if (success && cache->doSlaveRequest())
         {
             //Still more to issue, rerequest in 1 cycle
             pkt = NULL;
             BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
             reqCpu->schedule(curTick + 1);
+            cshrRetry = NULL;
         }
-
     }
+    if (waitingOnRetry) DPRINTF(CachePort, "%s STILL Waiting on retry\n", name());
+    else DPRINTF(CachePort, "%s no longer waiting on retry\n", name());
     return;
 }
 void
@@ -169,16 +221,47 @@ BaseCache::CacheEvent::process()
 {
     if (!pkt)
     {
-        if (!cachePort->isCpuSide)
-        {
-            //MSHR
+        if (cachePort->waitingOnRetry) return;
+       //We have some responses to drain first
+        if (!cachePort->drainList.empty()) {
+            DPRINTF(CachePort, "%s trying to drain a response\n", cachePort->name());
+            if (cachePort->sendTiming(cachePort->drainList.front())) {
+                DPRINTF(CachePort, "%s drains a response succesfully\n", cachePort->name());
+                cachePort->drainList.pop_front();
+                if (!cachePort->drainList.empty() ||
+                    !cachePort->isCpuSide && cachePort->cache->doMasterRequest() ||
+                    cachePort->isCpuSide && cachePort->cache->doSlaveRequest()) {
+
+                    DPRINTF(CachePort, "%s still has outstanding bus reqs\n", cachePort->name());
+                    this->schedule(curTick + 1);
+                }
+            }
+            else {
+                cachePort->waitingOnRetry = true;
+                DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
+            }
+        }
+        else if (!cachePort->isCpuSide)
+        {            //MSHR
+            DPRINTF(CachePort, "%s trying to send a MSHR request\n", cachePort->name());
+            if (!cachePort->cache->doMasterRequest()) {
+                //This can happen if I am the owner of a block and see an upgrade
+                //while the block was in my WB Buffers.  I just remove the
+                //wb and de-assert the masterRequest
+                return;
+            }
+
             pkt = cachePort->cache->getPacket();
+            MSHR* mshr = (MSHR*) pkt->senderState;
             bool success = cachePort->sendTiming(pkt);
             DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                     pkt->getAddr(), success ? "succesful" : "unsuccesful");
-            cachePort->cache->sendResult(pkt, success);
+            cachePort->cache->sendResult(pkt, mshr, success);
+            cachePort->waitingOnRetry = !success;
+            if (cachePort->waitingOnRetry) DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
             if (success && cachePort->cache->doMasterRequest())
             {
+                DPRINTF(CachePort, "%s still more MSHR requests to send\n", cachePort->name());
                 //Still more to issue, rerequest in 1 cycle
                 pkt = NULL;
                 this->schedule(curTick+1);
@@ -186,10 +269,16 @@ BaseCache::CacheEvent::process()
         }
         else
         {
+            assert(cachePort->cache->doSlaveRequest());
             //CSHR
             pkt = cachePort->cache->getCoherencePacket();
             bool success = cachePort->sendTiming(pkt);
-            if (success && cachePort->cache->doSlaveRequest())
+            if (!success) {
+                //Need to send on a retry
+                cachePort->cshrRetry = pkt;
+                cachePort->waitingOnRetry = true;
+            }
+            else if (cachePort->cache->doSlaveRequest())
             {
                 //Still more to issue, rerequest in 1 cycle
                 pkt = NULL;
@@ -199,8 +288,24 @@ BaseCache::CacheEvent::process()
         return;
     }
     //Response
-    //Know the packet to send, no need to mark in service (must succed)
-    assert(cachePort->sendTiming(pkt));
+    //Know the packet to send
+    if (pkt->flags & NACKED_LINE)
+        pkt->result = Packet::Nacked;
+    else
+        pkt->result = Packet::Success;
+    pkt->makeTimingResponse();
+    DPRINTF(CachePort, "%s attempting to send a response\n", cachePort->name());
+    if (!cachePort->drainList.empty() || cachePort->waitingOnRetry) {
+        //Already have a list, just append
+        cachePort->drainList.push_back(pkt);
+        DPRINTF(CachePort, "%s appending response onto drain list\n", cachePort->name());
+    }
+    else if (!cachePort->sendTiming(pkt)) {
+        //It failed, save it to list of drain events
+        DPRINTF(CachePort, "%s now waiting for a retry\n", cachePort->name());
+        cachePort->drainList.push_back(pkt);
+        cachePort->waitingOnRetry = true;
+    }
 }
 
 const char *
diff --git a/src/mem/cache/base_cache.hh b/src/mem/cache/base_cache.hh
index 069dbab58..563b1ca8b 100644
--- a/src/mem/cache/base_cache.hh
+++ b/src/mem/cache/base_cache.hh
@@ -72,6 +72,7 @@ enum RequestCause{
     Request_PF
 };
 
+class MSHR;
 /**
  * A basic cache interface. Implements some common functions for speed.
  */
@@ -110,6 +111,12 @@ class BaseCache : public MemObject
         bool mustSendRetry;
 
         bool isCpuSide;
+
+        bool waitingOnRetry;
+
+        std::list<Packet *> drainList;
+
+        Packet *cshrRetry;
     };
 
     struct CacheEvent : public Event
@@ -127,6 +134,8 @@ class BaseCache : public MemObject
     CachePort *cpuSidePort;
     CachePort *memSidePort;
 
+    bool snoopRangesSent;
+
   public:
     virtual Port *getPort(const std::string &if_name, int idx = -1);
 
@@ -149,14 +158,15 @@ class BaseCache : public MemObject
 
     void recvStatusChange(Port::Status status, bool isCpuSide)
     {
-        if (status == Port::RangeChange)
-        {
-            if (!isCpuSide)
-            {
+        if (status == Port::RangeChange){
+            if (!isCpuSide) {
                 cpuSidePort->sendStatusChange(Port::RangeChange);
+                if (!snoopRangesSent) {
+                    snoopRangesSent = true;
+                    memSidePort->sendStatusChange(Port::RangeChange);
+                }
             }
-            else
-            {
+            else {
                 memSidePort->sendStatusChange(Port::RangeChange);
             }
         }
@@ -172,7 +182,7 @@ class BaseCache : public MemObject
         fatal("No implementation");
     }
 
-    virtual void sendResult(Packet* &pkt, bool success)
+    virtual void sendResult(Packet* &pkt, MSHR* mshr, bool success)
     {
 
         fatal("No implementation");
@@ -205,6 +215,7 @@ class BaseCache : public MemObject
     /** True if this cache is connected to the CPU. */
     bool topLevelCache;
 
+
     /** Stores time the cache blocked for statistics. */
     Tick blockedCycle;
 
@@ -332,6 +343,7 @@ class BaseCache : public MemObject
         //Start ports at null if more than one is created we should panic
         cpuSidePort = NULL;
         memSidePort = NULL;
+        snoopRangesSent = false;
     }
 
     virtual void init();
@@ -382,9 +394,14 @@ class BaseCache : public MemObject
             blocked_causes[cause]++;
             blockedCycle = curTick;
         }
-        blocked |= flag;
-        DPRINTF(Cache,"Blocking for cause %s\n", cause);
-        cpuSidePort->setBlocked();
+        int old_state = blocked;
+        if (!(blocked & flag)) {
+            //Wasn't already blocked for this cause
+            blocked |= flag;
+            DPRINTF(Cache,"Blocking for cause %s\n", cause);
+            if (!old_state)
+                cpuSidePort->setBlocked();
+        }
     }
 
     /**
@@ -395,8 +412,13 @@ class BaseCache : public MemObject
     void setBlockedForSnoop(BlockedCause cause)
     {
         uint8_t flag = 1 << cause;
-        blockedSnoop |= flag;
-        memSidePort->setBlocked();
+        uint8_t old_state = blockedSnoop;
+        if (!(blockedSnoop & flag)) {
+            //Wasn't already blocked for this cause
+            blockedSnoop |= flag;
+            if (!old_state)
+                memSidePort->setBlocked();
+        }
     }
 
     /**
@@ -445,7 +467,7 @@ class BaseCache : public MemObject
      */
     void setMasterRequest(RequestCause cause, Tick time)
     {
-        if (!doMasterRequest())
+        if (!doMasterRequest() && !memSidePort->waitingOnRetry)
         {
             BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(memSidePort);
             reqCpu->schedule(time);
@@ -503,10 +525,14 @@ class BaseCache : public MemObject
      */
     void respond(Packet *pkt, Tick time)
     {
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
-        CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
-        reqCpu->schedule(time);
+        if (pkt->needsResponse()) {
+            CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+            reqCpu->schedule(time);
+        }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
+        }
     }
 
     /**
@@ -517,22 +543,29 @@ class BaseCache : public MemObject
     void respondToMiss(Packet *pkt, Tick time)
     {
         if (!pkt->req->isUncacheable()) {
-            missLatency[pkt->cmdToIndex()][pkt->req->getThreadNum()] += time - pkt->time;
+            missLatency[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] += time - pkt->time;
+        }
+        if (pkt->needsResponse()) {
+            CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+            reqCpu->schedule(time);
+        }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
         }
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
-        CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
-        reqCpu->schedule(time);
     }
 
     /**
      * Suppliess the data if cache to cache transfers are enabled.
      * @param pkt The bus transaction to fulfill.
      */
-    void respondToSnoop(Packet *pkt)
+    void respondToSnoop(Packet *pkt, Tick time)
     {
-        assert("Implement\n" && 0);
+//        assert("Implement\n" && 0);
 //	mi->respond(pkt,curTick + hitLatency);
+        assert (pkt->needsResponse());
+        CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
+        reqMem->schedule(time);
     }
 
     /**
@@ -551,6 +584,16 @@ class BaseCache : public MemObject
         else
         {
             //This is where snoops get updated
+            AddrRangeList dummy;
+//            if (!topLevelCache)
+//            {
+                cpuSidePort->getPeerAddressRanges(dummy, snoop);
+//            }
+//            else
+//            {
+//                snoop.push_back(RangeSize(0,-1));
+//            }
+
             return;
         }
     }
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index 989b8743e..41b270030 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -103,6 +103,7 @@ class Cache : public BaseCache
       * Used to append to target list, to cause an invalidation.
       */
     Packet * invalidatePkt;
+    Request *invalidateReq;
 
     /**
      * Temporarily move a block into a MSHR.
@@ -175,7 +176,7 @@ class Cache : public BaseCache
      * @param pkt The request.
      * @param success True if the request was sent successfully.
      */
-    virtual void sendResult(Packet * &pkt, bool success);
+    virtual void sendResult(Packet * &pkt, MSHR* mshr, bool success);
 
     /**
      * Handles a response (cache line fill/write ack) from the bus.
@@ -251,7 +252,7 @@ class Cache : public BaseCache
      * request.
      * @return The estimated completion time.
      */
-    Tick probe(Packet * &pkt, bool update);
+    Tick probe(Packet * &pkt, bool update, CachePort * otherSidePort);
 
     /**
      * Snoop for the provided request in the cache and return the estimated
@@ -262,7 +263,7 @@ class Cache : public BaseCache
      * request.
      * @return The estimated completion time.
      */
-    Tick snoopProbe(Packet * &pkt, bool update);
+    Tick snoopProbe(Packet * &pkt);
 };
 
 #endif // __CACHE_HH__
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index 11cd84e88..a68418f24 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -51,7 +51,7 @@
 #include "mem/cache/miss/mshr.hh"
 #include "mem/cache/prefetch/prefetcher.hh"
 
-#include "sim/sim_events.hh" // for SimExitEvent
+#include "sim/sim_exit.hh" // for SimExitEvent
 
 template<class TagStore, class Buffering, class Coherence>
 bool
@@ -60,17 +60,21 @@ doTimingAccess(Packet *pkt, CachePort *cachePort, bool isCpuSide)
 {
     if (isCpuSide)
     {
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
             pkt->req->setScResult(1);
         }
         access(pkt);
+
     }
     else
     {
         if (pkt->isResponse())
             handleResponse(pkt);
-        else
-            snoop(pkt);
+        else {
+            //Check if we should do the snoop
+            if (pkt->flags & SNOOP_COMMIT)
+                snoop(pkt);
+        }
     }
     return true;
 }
@@ -83,11 +87,11 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
     if (isCpuSide)
     {
         //Temporary solution to LL/SC
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
             pkt->req->setScResult(1);
         }
 
-        probe(pkt, true);
+        probe(pkt, true, NULL);
         //TEMP ALWAYS SUCCES FOR NOW
         pkt->result = Packet::Success;
     }
@@ -96,7 +100,7 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
         if (pkt->isResponse())
             handleResponse(pkt);
         else
-            snoopProbe(pkt, true);
+            return snoopProbe(pkt);
     }
     //Fix this timing info
     return hitLatency;
@@ -113,20 +117,17 @@ doFunctionalAccess(Packet *pkt, bool isCpuSide)
         pkt->req->setThreadContext(0,0);
 
         //Temporary solution to LL/SC
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
             assert("Can't handle LL/SC on functional path\n");
         }
 
-        probe(pkt, true);
+        probe(pkt, false, memSidePort);
         //TEMP ALWAYS SUCCESFUL FOR NOW
         pkt->result = Packet::Success;
     }
     else
     {
-        if (pkt->isResponse())
-            handleResponse(pkt);
-        else
-            snoopProbe(pkt, true);
+            probe(pkt, false, cpuSidePort);
     }
 }
 
@@ -147,7 +148,8 @@ Cache(const std::string &_name,
       prefetchAccess(params.prefetchAccess),
       tags(params.tags), missQueue(params.missQueue),
       coherence(params.coherence), prefetcher(params.prefetcher),
-      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy)
+      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy),
+      hitLatency(params.hitLatency)
 {
 //FIX BUS POINTERS
 //    if (params.in == NULL) {
@@ -162,10 +164,8 @@ Cache(const std::string &_name,
     prefetcher->setCache(this);
     prefetcher->setTags(tags);
     prefetcher->setBuffer(missQueue);
-#if 0
-    invalidatePkt = new Packet;
-    invalidatePkt->cmd = Packet::InvalidateReq;
-#endif
+    invalidateReq = new Request((Addr) NULL, blkSize, 0);
+    invalidatePkt = new Packet(invalidateReq, Packet::InvalidateReq, 0);
 }
 
 template<class TagStore, class Buffering, class Coherence>
@@ -194,20 +194,6 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
         prefetcher->handleMiss(pkt, curTick);
     }
     if (!pkt->req->isUncacheable()) {
-        if (pkt->isInvalidate() && !pkt->isRead()
-            && !pkt->isWrite()) {
-            //Upgrade or Invalidate
-            //Look into what happens if two slave caches on bus
-            DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
-                    pkt->getAddr() & (((ULL(1))<<48)-1),
-                    pkt->getAddr() & ~((Addr)blkSize - 1));
-
-            //@todo Should this return latency have the hit latency in it?
-//	    respond(pkt,curTick+lat);
-            pkt->flags |= SATISFIED;
-//            return MA_HIT; //@todo, return values
-            return true;
-        }
         blk = tags->handleAccess(pkt, lat, writebacks);
     } else {
         size = pkt->getSize();
@@ -234,27 +220,30 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
         missQueue->doWriteback(writebacks.front());
         writebacks.pop_front();
     }
-    DPRINTF(Cache, "%s %x %s blk_addr: %x pc %x\n", pkt->cmdString(),
+    DPRINTF(Cache, "%s %x %s blk_addr: %x\n", pkt->cmdString(),
             pkt->getAddr() & (((ULL(1))<<48)-1), (blk) ? "hit" : "miss",
-            pkt->getAddr() & ~((Addr)blkSize - 1), pkt->req->getPC());
+            pkt->getAddr() & ~((Addr)blkSize - 1));
     if (blk) {
         // Hit
-        hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         // clear dirty bit if write through
         if (pkt->needsResponse())
             respond(pkt, curTick+lat);
-//	return MA_HIT;
+        if (pkt->cmd == Packet::Writeback) {
+            //Signal that you can kill the pkt/req
+            pkt->flags |= SATISFIED;
+        }
         return true;
     }
 
     // Miss
     if (!pkt->req->isUncacheable()) {
-        misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         /** @todo Move miss count code into BaseCache */
         if (missCount) {
             --missCount;
             if (missCount == 0)
-                new SimLoopExitEvent(curTick, "A cache reached the maximum miss count");
+                exitSimLoop("A cache reached the maximum miss count");
         }
     }
     missQueue->handleMiss(pkt, size, curTick + hitLatency);
@@ -267,10 +256,11 @@ template<class TagStore, class Buffering, class Coherence>
 Packet *
 Cache<TagStore,Buffering,Coherence>::getPacket()
 {
+    assert(missQueue->havePending());
     Packet * pkt = missQueue->getPacket();
     if (pkt) {
         if (!pkt->req->isUncacheable()) {
-            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][pkt->req->getThreadNum()]++;
+            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][0/*pkt->req->getThreadNum()*/]++;
             BlkType *blk = tags->findBlock(pkt);
             Packet::Command cmd = coherence->getBusCmd(pkt->cmd,
                                               (blk)? blk->status : 0);
@@ -285,15 +275,30 @@ Cache<TagStore,Buffering,Coherence>::getPacket()
 
 template<class TagStore, class Buffering, class Coherence>
 void
-Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, bool success)
+Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr, bool success)
 {
-    if (success) {
-        missQueue->markInService(pkt);
-          //Temp Hack for UPGRADES
-          if (pkt->cmd == Packet::UpgradeReq) {
-              handleResponse(pkt);
-          }
+    if (success && !(pkt->flags & NACKED_LINE)) {
+        missQueue->markInService(pkt, mshr);
+        //Temp Hack for UPGRADES
+        if (pkt->cmd == Packet::UpgradeReq) {
+            pkt->flags &= ~CACHE_LINE_FILL;
+            BlkType *blk = tags->findBlock(pkt);
+            CacheBlk::State old_state = (blk) ? blk->status : 0;
+            CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+            //Set the state on the upgrade
+            memcpy(pkt->getPtr<uint8_t>(), blk->data, blkSize);
+            PacketList writebacks;
+            tags->handleFill(blk, mshr, new_state, writebacks, pkt);
+            assert(writebacks.empty());
+            missQueue->handleResponse(pkt, curTick + hitLatency);
+        }
     } else if (pkt && !pkt->req->isUncacheable()) {
+        pkt->flags &= ~NACKED_LINE;
+        pkt->flags &= ~SATISFIED;
+        pkt->flags &= ~SNOOP_COMMIT;
         missQueue->restoreOrigCmd(pkt);
     }
 }
@@ -304,6 +309,14 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
 {
     BlkType *blk = NULL;
     if (pkt->senderState) {
+        if (pkt->result == Packet::Nacked) {
+            //pkt->reinitFromRequest();
+            warn("NACKs from devices not connected to the same bus not implemented\n");
+            return;
+        }
+        if (pkt->result == Packet::BadAddress) {
+            //Make the response a Bad address and send it
+        }
 //	MemDebug::cacheResponse(pkt);
         DPRINTF(Cache, "Handling reponse to %x, blk addr: %x\n",pkt->getAddr(),
                 pkt->getAddr() & (((ULL(1))<<48)-1));
@@ -312,11 +325,15 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
             blk = tags->findBlock(pkt);
             CacheBlk::State old_state = (blk) ? blk->status : 0;
             PacketList writebacks;
+            CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
             blk = tags->handleFill(blk, (MSHR*)pkt->senderState,
-                                   coherence->getNewState(pkt,old_state),
-                                   writebacks);
+                                   new_state, writebacks, pkt);
             while (!writebacks.empty()) {
                     missQueue->doWriteback(writebacks.front());
+                    writebacks.pop_front();
             }
         }
         missQueue->handleResponse(pkt, curTick + hitLatency);
@@ -372,7 +389,6 @@ template<class TagStore, class Buffering, class Coherence>
 void
 Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
 {
-
     Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
     BlkType *blk = tags->findBlock(pkt);
     MSHR *mshr = missQueue->findMSHR(blk_addr);
@@ -385,7 +401,12 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     //If the outstanding request was an invalidate (upgrade,readex,..)
                     //Then we need to ACK the request until we get the data
                     //Also NACK if the outstanding request is not a cachefill (writeback)
+                    assert(!(pkt->flags & SATISFIED));
+                    pkt->flags |= SATISFIED;
                     pkt->flags |= NACKED_LINE;
+                    ///@todo NACK's from other levels
+                    //warn("NACKs from devices not connected to the same bus not implemented\n");
+                    //respondToSnoop(pkt, curTick + hitLatency);
                     return;
                 }
                 else {
@@ -398,6 +419,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     //@todo Make it so that a read to a pending read can't be exclusive now.
 
                     //Set the address so find match works
+                    //panic("Don't have invalidates yet\n");
                     invalidatePkt->addrOverride(pkt->getAddr());
 
                     //Append the invalidate on
@@ -420,6 +442,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     if (pkt->isRead()) {
                         //Only Upgrades don't get here
                         //Supply the data
+                        assert(!(pkt->flags & SATISFIED));
                         pkt->flags |= SATISFIED;
 
                         //If we are in an exclusive protocol, make it ask again
@@ -427,18 +450,18 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                         pkt->flags |= SHARED_LINE;
 
                         assert(pkt->isRead());
-                        Addr offset = pkt->getAddr() & ~(blkSize - 1);
+                        Addr offset = pkt->getAddr() & (blkSize - 1);
                         assert(offset < blkSize);
                         assert(pkt->getSize() <= blkSize);
                         assert(offset + pkt->getSize() <=blkSize);
                         memcpy(pkt->getPtr<uint8_t>(), mshr->pkt->getPtr<uint8_t>() + offset, pkt->getSize());
 
-                        respondToSnoop(pkt);
+                        respondToSnoop(pkt, curTick + hitLatency);
                     }
 
                     if (pkt->isInvalidate()) {
                         //This must be an upgrade or other cache will take ownership
-                        missQueue->markInService(mshr->pkt);
+                        missQueue->markInService(mshr->pkt, mshr);
                     }
                     return;
                 }
@@ -448,10 +471,16 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
     CacheBlk::State new_state;
     bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
     if (satisfy) {
+        DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
+                "new state is %i\n",
+                pkt->cmdString(), blk_addr, new_state);
+
         tags->handleSnoop(blk, new_state, pkt);
-        respondToSnoop(pkt);
+        respondToSnoop(pkt, curTick + hitLatency);
         return;
     }
+    if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
     tags->handleSnoop(blk, new_state);
 }
 
@@ -486,7 +515,7 @@ Cache<TagStore,Buffering,Coherence>::invalidateBlk(Addr addr)
  */
 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort* otherSidePort)
 {
 //    MemDebug::cacheProbe(pkt);
     if (!pkt->req->isUncacheable()) {
@@ -505,6 +534,10 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
     int lat;
     BlkType *blk = tags->handleAccess(pkt, lat, writebacks, update);
 
+    DPRINTF(Cache, "%s %x %s blk_addr: %x\n", pkt->cmdString(),
+            pkt->getAddr() & (((ULL(1))<<48)-1), (blk) ? "hit" : "miss",
+            pkt->getAddr() & ~((Addr)blkSize - 1));
+
     if (!blk) {
         // Need to check for outstanding misses and writes
         Addr blk_addr = pkt->getAddr() & ~(blkSize - 1);
@@ -517,7 +550,8 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
         missQueue->findWrites(blk_addr, writes);
 
         if (!update) {
-            memSidePort->sendFunctional(pkt);
+                otherSidePort->sendFunctional(pkt);
+
             // Check for data in MSHR and writebuffer.
             if (mshr) {
                 warn("Found outstanding miss on an non-update probe");
@@ -596,7 +630,7 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
             // update the cache state and statistics
             if (mshr || !writes.empty()){
                 // Can't handle it, return pktuest unsatisfied.
-                return 0;
+                panic("Atomic access ran into outstanding MSHR's or WB's!");
             }
             if (!pkt->req->isUncacheable()) {
                 // Fetch the cache block to fill
@@ -610,23 +644,46 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
 
                 busPkt->time = curTick;
 
+                DPRINTF(Cache, "Sending a atomic %s for %x blk_addr: %x\n",
+                        busPkt->cmdString(),
+                        busPkt->getAddr() & (((ULL(1))<<48)-1),
+                        busPkt->getAddr() & ~((Addr)blkSize - 1));
+
                 lat = memSidePort->sendAtomic(busPkt);
 
+                //Be sure to flip the response to a request for coherence
+                if (busPkt->needsResponse()) {
+                    busPkt->makeAtomicResponse();
+                }
+
 /*		if (!(busPkt->flags & SATISFIED)) {
                     // blocked at a higher level, just return
                     return 0;
                 }
 
-*/		misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+*/		misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
 
                 CacheBlk::State old_state = (blk) ? blk->status : 0;
+                CacheBlk::State new_state = coherence->getNewState(busPkt, old_state);
+                    DPRINTF(Cache, "Receive response:%s for blk addr %x in state %i\n",
+                            busPkt->cmdString(),
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state);
+                if (old_state != new_state)
+                    DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+
                 tags->handleFill(blk, busPkt,
-                                 coherence->getNewState(busPkt, old_state),
+                                 new_state,
                                  writebacks, pkt);
+                //Free the packet
+                delete busPkt;
+
                 // Handle writebacks if needed
                 while (!writebacks.empty()){
-                    memSidePort->sendAtomic(writebacks.front());
+                    Packet *wbPkt = writebacks.front();
+                    memSidePort->sendAtomic(wbPkt);
                     writebacks.pop_front();
+                    delete wbPkt;
                 }
                 return lat + hitLatency;
             } else {
@@ -642,12 +699,12 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
         }
 
         if (update) {
-            hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         } else if (pkt->isWrite()) {
             // Still need to change data in all locations.
-            return memSidePort->sendAtomic(pkt);
+            otherSidePort->sendFunctional(pkt);
         }
-        return curTick + lat;
+        return hitLatency;
     }
     fatal("Probe not handled.\n");
     return 0;
@@ -655,18 +712,24 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
 
 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt)
 {
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
-    BlkType *blk = tags->findBlock(pkt);
-    MSHR *mshr = missQueue->findMSHR(blk_addr);
-    CacheBlk::State new_state = 0;
-    bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
-    if (satisfy) {
-        tags->handleSnoop(blk, new_state, pkt);
-        return hitLatency;
-    }
-    tags->handleSnoop(blk, new_state);
-    return 0;
+        Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
+        BlkType *blk = tags->findBlock(pkt);
+        MSHR *mshr = missQueue->findMSHR(blk_addr);
+        CacheBlk::State new_state = 0;
+        bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
+        if (satisfy) {
+            DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
+                    "new state is %i\n",
+                    pkt->cmdString(), blk_addr, new_state);
+
+            tags->handleSnoop(blk, new_state, pkt);
+            return hitLatency;
+        }
+        if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
+        tags->handleSnoop(blk, new_state);
+        return 0;
 }
 
diff --git a/src/mem/cache/coherence/coherence_protocol.cc b/src/mem/cache/coherence/coherence_protocol.cc
index bcf3ce9c5..e28dda3dc 100644
--- a/src/mem/cache/coherence/coherence_protocol.cc
+++ b/src/mem/cache/coherence/coherence_protocol.cc
@@ -271,7 +271,7 @@ CoherenceProtocol::CoherenceProtocol(const string &name,
     }
 
     Packet::Command writeToSharedCmd = doUpgrades ? Packet::UpgradeReq : Packet::ReadExReq;
-    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeResp : Packet::ReadExResp;
+    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeReq : Packet::ReadExResp;
 
 //@todo add in hardware prefetch to this list
     if (protocol == "msi") {
diff --git a/src/mem/cache/miss/blocking_buffer.cc b/src/mem/cache/miss/blocking_buffer.cc
index 67fc7ae56..f7aacff89 100644
--- a/src/mem/cache/miss/blocking_buffer.cc
+++ b/src/mem/cache/miss/blocking_buffer.cc
@@ -123,12 +123,12 @@ BlockingBuffer::restoreOrigCmd(Packet * &pkt)
 }
 
 void
-BlockingBuffer::markInService(Packet * &pkt)
+BlockingBuffer::markInService(Packet * &pkt, MSHR* mshr)
 {
     if (!pkt->isCacheFill() && pkt->isWrite()) {
         // Forwarding a write/ writeback, don't need to change
         // the command
-        assert((MSHR*)pkt->senderState == &wb);
+        assert(mshr == &wb);
         cache->clearMasterRequest(Request_WB);
         if (!pkt->needsResponse()) {
             assert(wb.getNumTargets() == 0);
@@ -138,7 +138,7 @@ BlockingBuffer::markInService(Packet * &pkt)
             wb.inService = true;
         }
     } else {
-        assert((MSHR*)pkt->senderState == &miss);
+        assert(mshr == &miss);
         cache->clearMasterRequest(Request_MSHR);
         if (!pkt->needsResponse()) {
             assert(miss.getNumTargets() == 0);
@@ -189,7 +189,7 @@ BlockingBuffer::squash(int threadNum)
     if (miss.threadNum == threadNum) {
         Packet * target = miss.getTarget();
         miss.popTarget();
-        assert(target->req->getThreadNum() == threadNum);
+        assert(0/*target->req->getThreadNum()*/ == threadNum);
         target = NULL;
         assert(!miss.hasTargets());
         miss.ntargets=0;
@@ -218,7 +218,7 @@ BlockingBuffer::doWriteback(Addr addr,
     }
 
     ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     wb.allocateAsBuffer(pkt);
     cache->setMasterRequest(Request_WB, curTick);
@@ -230,7 +230,7 @@ BlockingBuffer::doWriteback(Addr addr,
 void
 BlockingBuffer::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     wb.allocateAsBuffer(pkt);
 
diff --git a/src/mem/cache/miss/blocking_buffer.hh b/src/mem/cache/miss/blocking_buffer.hh
index 641d5a798..f7069696c 100644
--- a/src/mem/cache/miss/blocking_buffer.hh
+++ b/src/mem/cache/miss/blocking_buffer.hh
@@ -152,7 +152,7 @@ public:
      * are successfully sent.
      * @param pkt The request that was sent on the bus.
      */
-    void markInService(Packet * &pkt);
+    void markInService(Packet * &pkt, MSHR* mshr);
 
     /**
      * Frees the resources of the pktuest and unblock the cache.
diff --git a/src/mem/cache/miss/miss_queue.cc b/src/mem/cache/miss/miss_queue.cc
index 76fb25716..c7b0e0890 100644
--- a/src/mem/cache/miss/miss_queue.cc
+++ b/src/mem/cache/miss/miss_queue.cc
@@ -372,7 +372,7 @@ MissQueue::allocateMiss(Packet * &pkt, int size, Tick time)
 MSHR*
 MissQueue::allocateWrite(Packet * &pkt, int size, Tick time)
 {
-    MSHR* mshr = wb.allocate(pkt,blkSize);
+    MSHR* mshr = wb.allocate(pkt,size);
     mshr->order = order++;
 
 //REMOVING COMPRESSION FOR NOW
@@ -413,8 +413,8 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
         mshr = mq.findMatch(blkAddr);
         if (mshr) {
             //@todo remove hw_pf here
-            mshr_hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
-            if (mshr->threadNum != pkt->req->getThreadNum()) {
+            mshr_hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
+            if (mshr->threadNum != 0/*pkt->req->getThreadNum()*/) {
                 mshr->threadNum = -1;
             }
             mq.allocateTarget(mshr, pkt);
@@ -434,11 +434,11 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
             mshr_no_allocate_misses++;
         }
         else {
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         }
     } else {
         //Count uncacheable accesses
-        mshr_uncacheable[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        mshr_uncacheable[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         size = pkt->getSize();
     }
     if (pkt->isWrite() && (pkt->req->isUncacheable() || !writeAllocate ||
@@ -446,7 +446,7 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
         /**
          * @todo Add write merging here.
          */
-        mshr = allocateWrite(pkt, blkSize, time);
+        mshr = allocateWrite(pkt, pkt->getSize(), time);
         return;
     }
 
@@ -499,7 +499,7 @@ MissQueue::getPacket()
         pkt = prefetcher->getPacket();
         if (pkt) {
             //Update statistic on number of prefetches issued (hwpf_mshr_misses)
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
             //It will request the bus for the future, but should clear that immedieatley
             allocateMiss(pkt, pkt->getSize(), curTick);
             pkt = mq.getReq();
@@ -515,6 +515,14 @@ MissQueue::setBusCmd(Packet * &pkt, Packet::Command cmd)
     assert(pkt->senderState != 0);
     MSHR * mshr = (MSHR*)pkt->senderState;
     mshr->originalCmd = pkt->cmd;
+    if (cmd == Packet::UpgradeReq || cmd == Packet::InvalidateReq) {
+        pkt->flags |= NO_ALLOCATE;
+        pkt->flags &= ~CACHE_LINE_FILL;
+    }
+    else if (!pkt->req->isUncacheable() && !pkt->isNoAllocate() &&
+             (cmd & (1 << 6)/*NeedsResponse*/)) {
+        pkt->flags |= CACHE_LINE_FILL;
+    }
     if (pkt->isCacheFill() || pkt->isNoAllocate())
         pkt->cmd = cmd;
 }
@@ -526,9 +534,8 @@ MissQueue::restoreOrigCmd(Packet * &pkt)
 }
 
 void
-MissQueue::markInService(Packet * &pkt)
+MissQueue::markInService(Packet * &pkt, MSHR* mshr)
 {
-    assert(pkt->senderState != 0);
     bool unblock = false;
     BlockedCause cause = NUM_BLOCKED_CAUSES;
 
@@ -540,7 +547,7 @@ MissQueue::markInService(Packet * &pkt)
         // Forwarding a write/ writeback, don't need to change
         // the command
         unblock = wb.isFull();
-        wb.markInService((MSHR*)pkt->senderState);
+        wb.markInService(mshr);
         if (!wb.havePending()){
             cache->clearMasterRequest(Request_WB);
         }
@@ -551,11 +558,11 @@ MissQueue::markInService(Packet * &pkt)
         }
     } else {
         unblock = mq.isFull();
-        mq.markInService((MSHR*)pkt->senderState);
+        mq.markInService(mshr);
         if (!mq.havePending()){
             cache->clearMasterRequest(Request_MSHR);
         }
-        if (((MSHR*)(pkt->senderState))->originalCmd == Packet::HardPFReq) {
+        if (mshr->originalCmd == Packet::HardPFReq) {
             DPRINTF(HWPrefetch, "%s:Marking a HW_PF in service\n",
                     cache->name());
             //Also clear pending if need be
@@ -592,7 +599,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
     BlockedCause cause = NUM_BLOCKED_CAUSES;
 
     if (pkt->isCacheFill() && !pkt->isNoAllocate()) {
-        mshr_miss_latency[mshr->originalCmd][pkt->req->getThreadNum()] +=
+        mshr_miss_latency[mshr->originalCmd][0/*pkt->req->getThreadNum()*/] +=
             curTick - pkt->time;
         // targets were handled in the cache tags
         if (mshr == noTargetMSHR) {
@@ -619,7 +626,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
         }
     } else {
         if (pkt->req->isUncacheable()) {
-            mshr_uncacheable_lat[pkt->cmd][pkt->req->getThreadNum()] +=
+            mshr_uncacheable_lat[pkt->cmd][0/*pkt->req->getThreadNum()*/] +=
                 curTick - pkt->time;
         }
         if (mshr->hasTargets() && pkt->req->isUncacheable()) {
@@ -725,7 +732,7 @@ MissQueue::doWriteback(Addr addr,
     }
 
     ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     allocateWrite(pkt, 0, curTick);
 }
@@ -734,7 +741,7 @@ MissQueue::doWriteback(Addr addr,
 void
 MissQueue::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
     allocateWrite(pkt, 0, curTick);
 }
 
diff --git a/src/mem/cache/miss/miss_queue.hh b/src/mem/cache/miss/miss_queue.hh
index 505d1f90c..179638d2b 100644
--- a/src/mem/cache/miss/miss_queue.hh
+++ b/src/mem/cache/miss/miss_queue.hh
@@ -256,7 +256,7 @@ class MissQueue
      * are successfully sent.
      * @param pkt The request that was sent on the bus.
      */
-    void markInService(Packet * &pkt);
+    void markInService(Packet * &pkt, MSHR* mshr);
 
     /**
      * Collect statistics and free resources of a satisfied pktuest.
diff --git a/src/mem/cache/miss/mshr.cc b/src/mem/cache/miss/mshr.cc
index 519ec5ebd..455798f15 100644
--- a/src/mem/cache/miss/mshr.cc
+++ b/src/mem/cache/miss/mshr.cc
@@ -88,7 +88,7 @@ void
 MSHR::allocateAsBuffer(Packet * &target)
 {
     addr = target->getAddr();
-    threadNum = target->req->getThreadNum();
+    threadNum = 0/*target->req->getThreadNum()*/;
     pkt = new Packet(target->req, target->cmd, -1);
     pkt->allocate();
     pkt->senderState = (Packet::SenderState*)this;
@@ -100,6 +100,7 @@ MSHR::deallocate()
 {
     assert(targets.empty());
     assert(ntargets == 0);
+    delete pkt;
     pkt = NULL;
     inService = false;
     //allocIter = NULL;
diff --git a/src/mem/cache/miss/mshr_queue.cc b/src/mem/cache/miss/mshr_queue.cc
index 97a56119f..1876a8987 100644
--- a/src/mem/cache/miss/mshr_queue.cc
+++ b/src/mem/cache/miss/mshr_queue.cc
@@ -128,6 +128,7 @@ MSHR*
 MSHRQueue::allocate(Packet * &pkt, int size)
 {
     Addr aligned_addr = pkt->getAddr() & ~((Addr)size - 1);
+    assert(!freeList.empty());
     MSHR *mshr = freeList.front();
     assert(mshr->getNumTargets() == 0);
     freeList.pop_front();
@@ -212,8 +213,13 @@ void
 MSHRQueue::markInService(MSHR* mshr)
 {
     //assert(mshr == pendingList.front());
-    if (!mshr->pkt->needsResponse()) {
+    if (!(mshr->pkt->needsResponse() || mshr->pkt->cmd == Packet::UpgradeReq)) {
         assert(mshr->getNumTargets() == 0);
+        if ((mshr->pkt->flags & SATISFIED) && (mshr->pkt->cmd == Packet::Writeback)) {
+            //Writeback hit, so delete it
+            //otherwise the consumer will delete it
+            delete mshr->pkt->req;
+        }
         deallocate(mshr);
         return;
     }
@@ -251,7 +257,7 @@ MSHRQueue::squash(int threadNum)
                 Packet * target = mshr->getTarget();
                 mshr->popTarget();
 
-                assert(target->req->getThreadNum() == threadNum);
+                assert(0/*target->req->getThreadNum()*/ == threadNum);
                 target = NULL;
             }
             assert(!mshr->hasTargets());
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index 91298df8c..4758fda89 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -39,9 +39,18 @@
 
 static const std::string ReadReqString("ReadReq");
 static const std::string WriteReqString("WriteReq");
-static const std::string WriteReqNoAckString("WriteReqNoAck");
+static const std::string WriteReqNoAckString("WriteReqNoAck|Writeback");
 static const std::string ReadRespString("ReadResp");
 static const std::string WriteRespString("WriteResp");
+static const std::string SoftPFReqString("SoftPFReq");
+static const std::string SoftPFRespString("SoftPFResp");
+static const std::string HardPFReqString("HardPFReq");
+static const std::string HardPFRespString("HardPFResp");
+static const std::string InvalidateReqString("InvalidateReq");
+static const std::string WriteInvalidateReqString("WriteInvalidateReq");
+static const std::string UpgradeReqString("UpgradeReq");
+static const std::string ReadExReqString("ReadExReq");
+static const std::string ReadExRespString("ReadExResp");
 static const std::string OtherCmdString("<other>");
 
 const std::string &
@@ -53,6 +62,15 @@ Packet::cmdString() const
       case WriteReqNoAck:   return WriteReqNoAckString;
       case ReadResp:        return ReadRespString;
       case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
       default:              return OtherCmdString;
     }
 }
@@ -66,6 +84,15 @@ Packet::cmdIdxToString(Packet::Command idx)
       case WriteReqNoAck:   return WriteReqNoAckString;
       case ReadResp:        return ReadRespString;
       case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
       default:              return OtherCmdString;
     }
 }
@@ -102,15 +129,11 @@ bool
 Packet::intersect(Packet *p)
 {
     Addr s1 = getAddr();
-    Addr e1 = getAddr() + getSize();
+    Addr e1 = getAddr() + getSize() - 1;
     Addr s2 = p->getAddr();
-    Addr e2 = p->getAddr() + p->getSize();
+    Addr e2 = p->getAddr() + p->getSize() - 1;
 
-    if (s1 >= s2 && s1 < e2)
-        return true;
-    if (e1 >= s2 && e1 < e2)
-        return true;
-    return false;
+    return !(s1 > e2 || e1 < s2);
 }
 
 bool
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index c7d28010c..7ede48bfd 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -58,10 +58,8 @@ typedef std::list<PacketPtr> PacketList;
 #define NO_ALLOCATE 1 << 5
 #define SNOOP_COMMIT 1 << 6
 
-//For statistics we need max number of commands, hard code it at
-//20 for now.  @todo fix later
-#define NUM_MEM_CMDS 1 << 9
-
+//for now.  @todo fix later
+#define NUM_MEM_CMDS 1 << 11
 /**
  * A Packet is used to encapsulate a transfer between two objects in
  * the memory system (e.g., the L1 and L2 cache).  (In contrast, a
@@ -94,7 +92,6 @@ class Packet
      *   be called on it rather than simply delete.*/
     bool arrayData;
 
-
     /** The address of the request.  This address could be virtual or
      *   physical, depending on the system configuration. */
     Addr addr;
@@ -126,6 +123,12 @@ class Packet
     /** Used to calculate latencies for each packet.*/
     Tick time;
 
+    /** The time at which the packet will be fully transmitted */
+    Tick finishTime;
+
+    /** The time at which the first chunk of the packet will be transmitted */
+    Tick firstWordTime;
+
     /** The special destination address indicating that the packet
      *   should be routed based on its address. */
     static const short Broadcast = -1;
@@ -164,6 +167,8 @@ class Packet
 
   private:
     /** List of command attributes. */
+    // If you add a new CommandAttribute, make sure to increase NUM_MEM_CMDS
+    // as well.
     enum CommandAttribute
     {
         IsRead		= 1 << 0,
@@ -174,7 +179,9 @@ class Packet
         IsResponse 	= 1 << 5,
         NeedsResponse	= 1 << 6,
         IsSWPrefetch    = 1 << 7,
-        IsHWPrefetch    = 1 << 8
+        IsHWPrefetch    = 1 << 8,
+        IsUpgrade       = 1 << 9,
+        HasData		= 1 << 10
     };
 
   public:
@@ -183,21 +190,23 @@ class Packet
     {
         InvalidCmd      = 0,
         ReadReq		= IsRead  | IsRequest | NeedsResponse,
-        WriteReq	= IsWrite | IsRequest | NeedsResponse,
-        WriteReqNoAck	= IsWrite | IsRequest,
-        ReadResp	= IsRead  | IsResponse | NeedsResponse,
+        WriteReq	= IsWrite | IsRequest | NeedsResponse | HasData,
+        WriteReqNoAck	= IsWrite | IsRequest | HasData,
+        ReadResp	= IsRead  | IsResponse | NeedsResponse | HasData,
         WriteResp	= IsWrite | IsResponse | NeedsResponse,
-        Writeback       = IsWrite | IsRequest,
+        Writeback       = IsWrite | IsRequest | HasData,
         SoftPFReq       = IsRead  | IsRequest | IsSWPrefetch | NeedsResponse,
         HardPFReq       = IsRead  | IsRequest | IsHWPrefetch | NeedsResponse,
-        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch | NeedsResponse,
-        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch | NeedsResponse,
+        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch
+                                | NeedsResponse | HasData,
+        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch
+                                | NeedsResponse | HasData,
         InvalidateReq   = IsInvalidate | IsRequest,
-        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest,
-        UpgradeReq      = IsInvalidate | IsRequest | NeedsResponse,
-        UpgradeResp     = IsInvalidate | IsResponse | NeedsResponse,
+        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest | HasData,
+        UpgradeReq      = IsInvalidate | IsRequest | IsUpgrade,
         ReadExReq       = IsRead | IsInvalidate | IsRequest | NeedsResponse,
-        ReadExResp      = IsRead | IsInvalidate | IsResponse | NeedsResponse
+        ReadExResp      = IsRead | IsInvalidate | IsResponse
+                                | NeedsResponse | HasData
     };
 
     /** Return the string name of the cmd field (for debugging and
@@ -219,6 +228,7 @@ class Packet
     bool isResponse()	 { return (cmd & IsResponse) != 0; }
     bool needsResponse() { return (cmd & NeedsResponse) != 0; }
     bool isInvalidate()  { return (cmd & IsInvalidate) != 0; }
+    bool hasData()	 { return (cmd & HasData) != 0; }
 
     bool isCacheFill() { return (flags & CACHE_LINE_FILL) != 0; }
     bool isNoAllocate() { return (flags & NO_ALLOCATE) != 0; }
@@ -312,7 +322,7 @@ class Packet
      *   for returning as a response to that request.  Used for timing
      *   accesses only.  For atomic and functional accesses, the
      *   request packet is always implicitly passed back *without*
-     *   modifying the command or destination fields, so this function
+     *   modifying the destination fields, so this function
      *   should not be called. */
     void makeTimingResponse() {
         assert(needsResponse());
@@ -320,11 +330,31 @@ class Packet
         int icmd = (int)cmd;
         icmd &= ~(IsRequest);
         icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
         cmd = (Command)icmd;
         dest = src;
         srcValid = false;
     }
 
+    /** Take a request packet and modify it in place to be suitable
+     *   for returning as a response to that request.
+     */
+    void makeAtomicResponse() {
+        assert(needsResponse());
+        assert(isRequest());
+        int icmd = (int)cmd;
+        icmd &= ~(IsRequest);
+        icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
+        cmd = (Command)icmd;
+    }
+
     /** Take a request packet that has been returned as NACKED and modify it so
      * that it can be sent out again. Only packets that need a response can be
      * NACKED, so verify that that is true. */
diff --git a/src/mem/physical.cc b/src/mem/physical.cc
index 8fea733ec..7303f278e 100644
--- a/src/mem/physical.cc
+++ b/src/mem/physical.cc
@@ -110,28 +110,112 @@ PhysicalMemory::calculateLatency(Packet *pkt)
     return lat;
 }
 
+
+
+// Add load-locked to tracking list.  Should only be called if the
+// operation is a load and the LOCKED flag is set.
+void
+PhysicalMemory::trackLoadLocked(Request *req)
+{
+    Addr paddr = LockedAddr::mask(req->getPaddr());
+
+    // first we check if we already have a locked addr for this
+    // xc.  Since each xc only gets one, we just update the
+    // existing record with the new address.
+    list<LockedAddr>::iterator i;
+
+    for (i = lockedAddrList.begin(); i != lockedAddrList.end(); ++i) {
+        if (i->matchesContext(req)) {
+            DPRINTF(LLSC, "Modifying lock record: cpu %d thread %d addr %#x\n",
+                    req->getCpuNum(), req->getThreadNum(), paddr);
+            i->addr = paddr;
+            return;
+        }
+    }
+
+    // no record for this xc: need to allocate a new one
+    DPRINTF(LLSC, "Adding lock record: cpu %d thread %d addr %#x\n",
+            req->getCpuNum(), req->getThreadNum(), paddr);
+    lockedAddrList.push_front(LockedAddr(req));
+}
+
+
+// Called on *writes* only... both regular stores and
+// store-conditional operations.  Check for conventional stores which
+// conflict with locked addresses, and for success/failure of store
+// conditionals.
+bool
+PhysicalMemory::checkLockedAddrList(Request *req)
+{
+    Addr paddr = LockedAddr::mask(req->getPaddr());
+    bool isLocked = req->isLocked();
+
+    // Initialize return value.  Non-conditional stores always
+    // succeed.  Assume conditional stores will fail until proven
+    // otherwise.
+    bool success = !isLocked;
+
+    // Iterate over list.  Note that there could be multiple matching
+    // records, as more than one context could have done a load locked
+    // to this location.
+    list<LockedAddr>::iterator i = lockedAddrList.begin();
+
+    while (i != lockedAddrList.end()) {
+
+        if (i->addr == paddr) {
+            // we have a matching address
+
+            if (isLocked && i->matchesContext(req)) {
+                // it's a store conditional, and as far as the memory
+                // system can tell, the requesting context's lock is
+                // still valid.
+                DPRINTF(LLSC, "StCond success: cpu %d thread %d addr %#x\n",
+                        req->getCpuNum(), req->getThreadNum(), paddr);
+                success = true;
+            }
+
+            // Get rid of our record of this lock and advance to next
+            DPRINTF(LLSC, "Erasing lock record: cpu %d thread %d addr %#x\n",
+                    i->cpuNum, i->threadNum, paddr);
+            i = lockedAddrList.erase(i);
+        }
+        else {
+            // no match: advance to next record
+            ++i;
+        }
+    }
+
+    if (isLocked) {
+        req->setScResult(success ? 1 : 0);
+    }
+
+    return success;
+}
+
 void
 PhysicalMemory::doFunctionalAccess(Packet *pkt)
 {
-    assert(pkt->getAddr() + pkt->getSize() < params()->addrRange.size());
+    assert(pkt->getAddr() + pkt->getSize() <= params()->addrRange.size());
 
-    switch (pkt->cmd) {
-      case Packet::ReadReq:
+    if (pkt->isRead()) {
+        if (pkt->req->isLocked()) {
+            trackLoadLocked(pkt->req);
+        }
         memcpy(pkt->getPtr<uint8_t>(),
                pmemAddr + pkt->getAddr() - params()->addrRange.start,
                pkt->getSize());
-        break;
-      case Packet::WriteReq:
-        memcpy(pmemAddr + pkt->getAddr() - params()->addrRange.start,
-               pkt->getPtr<uint8_t>(),
-               pkt->getSize());
-        // temporary hack: will need to add real LL/SC implementation
-        // for cacheless systems later.
-        if (pkt->req->getFlags() & LOCKED) {
-            pkt->req->setScResult(1);
+    }
+    else if (pkt->isWrite()) {
+        if (writeOK(pkt->req)) {
+            memcpy(pmemAddr + pkt->getAddr() - params()->addrRange.start,
+                   pkt->getPtr<uint8_t>(), pkt->getSize());
         }
-        break;
-      default:
+    }
+    else if (pkt->isInvalidate()) {
+        //upgrade or invalidate
+        pkt->flags |= SATISFIED;
+    }
+    else {
         panic("unimplemented");
     }
 
@@ -147,7 +231,7 @@ PhysicalMemory::getPort(const std::string &if_name, int idx)
         port = new MemoryPort(name() + "-port", this);
         return port;
     } else if (if_name == "functional") {
-        /* special port for functional writes at startup. */
+        /* special port for functional writes at startup. And for memtester */
         return new MemoryPort(name() + "-funcport", this);
     } else {
         panic("PhysicalMemory::getPort: unknown port %s requested", if_name);
diff --git a/src/mem/physical.hh b/src/mem/physical.hh
index 02308b2ef..97bea2ec4 100644
--- a/src/mem/physical.hh
+++ b/src/mem/physical.hh
@@ -78,6 +78,68 @@ class PhysicalMemory : public MemObject
     const PhysicalMemory &operator=(const PhysicalMemory &specmem);
 
   protected:
+
+    class LockedAddr {
+      public:
+        // on alpha, minimum LL/SC granularity is 16 bytes, so lower
+        // bits need to masked off.
+        static const Addr Addr_Mask = 0xf;
+
+        static Addr mask(Addr paddr) { return (paddr & ~Addr_Mask); }
+
+        Addr addr; 	// locked address
+        int cpuNum;	// locking CPU
+        int threadNum;	// locking thread ID within CPU
+
+        // check for matching execution context
+        bool matchesContext(Request *req)
+        {
+            return (cpuNum == req->getCpuNum() &&
+                    threadNum == req->getThreadNum());
+        }
+
+        LockedAddr(Request *req)
+            : addr(mask(req->getPaddr())),
+              cpuNum(req->getCpuNum()),
+              threadNum(req->getThreadNum())
+        {
+        }
+    };
+
+    std::list<LockedAddr> lockedAddrList;
+
+    // helper function for checkLockedAddrs(): we really want to
+    // inline a quick check for an empty locked addr list (hopefully
+    // the common case), and do the full list search (if necessary) in
+    // this out-of-line function
+    bool checkLockedAddrList(Request *req);
+
+    // Record the address of a load-locked operation so that we can
+    // clear the execution context's lock flag if a matching store is
+    // performed
+    void trackLoadLocked(Request *req);
+
+    // Compare a store address with any locked addresses so we can
+    // clear the lock flag appropriately.  Return value set to 'false'
+    // if store operation should be suppressed (because it was a
+    // conditional store and the address was no longer locked by the
+    // requesting execution context), 'true' otherwise.  Note that
+    // this method must be called on *all* stores since even
+    // non-conditional stores must clear any matching lock addresses.
+    bool writeOK(Request *req) {
+        if (lockedAddrList.empty()) {
+            // no locked addrs: nothing to check, store_conditional fails
+            bool isLocked = req->isLocked();
+            if (isLocked) {
+                req->setScResult(0);
+            }
+            return !isLocked; // only do write if not an sc
+        } else {
+            // iterate over list...
+            return checkLockedAddrList(req);
+        }
+    }
+
     uint8_t *pmemAddr;
     MemoryPort *port;
     int pagePtr;
diff --git a/src/mem/port.hh b/src/mem/port.hh
index 6b4184043..bb3bc1b1b 100644
--- a/src/mem/port.hh
+++ b/src/mem/port.hh
@@ -106,8 +106,7 @@ class Port
     /** Holds the ports status.  Currently just that a range recomputation needs
      * to be done. */
     enum Status {
-        RangeChange,
-        SnoopSquash
+        RangeChange
     };
 
     void setName(const std::string &name)
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 6acd7526c..e54984fcd 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -232,9 +232,11 @@ class Request
     Addr getPC() { assert(validPC); return pc; }
 
     /** Accessor Function to Check Cacheability. */
-    bool isUncacheable() { return getFlags() & UNCACHEABLE; }
+    bool isUncacheable() { return (getFlags() & UNCACHEABLE) != 0; }
 
-    bool isInstRead() { return getFlags() & INST_READ; }
+    bool isInstRead() { return (getFlags() & INST_READ) != 0; }
+
+    bool isLocked() { return (getFlags() & LOCKED) != 0; }
 
     friend class Packet;
 };
diff --git a/src/mem/tport.cc b/src/mem/tport.cc
index 55c301c87..456878d0a 100644
--- a/src/mem/tport.cc
+++ b/src/mem/tport.cc
@@ -47,22 +47,28 @@ SimpleTimingPort::recvTiming(Packet *pkt)
     // if we ever added it back.
     assert(pkt->result != Packet::Nacked);
     Tick latency = recvAtomic(pkt);
-    // turn packet around to go back to requester
-    pkt->makeTimingResponse();
-    sendTimingLater(pkt, latency);
+    // turn packet around to go back to requester if response expected
+    if (pkt->needsResponse()) {
+        pkt->makeTimingResponse();
+        sendTimingLater(pkt, latency);
+    }
     return true;
 }
 
 void
 SimpleTimingPort::recvRetry()
 {
-    bool result = true;
-    while (result && transmitList.size()) {
-        result = sendTiming(transmitList.front());
-        if (result)
-            transmitList.pop_front();
+    assert(outTiming > 0);
+    assert(!transmitList.empty());
+    if (sendTiming(transmitList.front())) {
+        transmitList.pop_front();
+        outTiming--;
+        DPRINTF(Bus, "No Longer waiting on retry\n");
+        if (!transmitList.empty())
+            sendTimingLater(transmitList.front(), 1);
     }
-    if (transmitList.size() == 0 && drainEvent) {
+
+    if (transmitList.empty() && drainEvent) {
         drainEvent->process();
         drainEvent = NULL;
     }
@@ -71,18 +77,28 @@ SimpleTimingPort::recvRetry()
 void
 SimpleTimingPort::SendEvent::process()
 {
-    port->outTiming--;
-    assert(port->outTiming >= 0);
-    if (port->sendTiming(packet)) {
-        // send successfule
-        if (port->transmitList.size() == 0 && port->drainEvent) {
+    assert(port->outTiming > 0);
+    if (!port->transmitList.empty() && port->transmitList.front() != packet) {
+        //We are not the head of the list
+        port->transmitList.push_back(packet);
+    } else if (port->sendTiming(packet)) {
+        // send successful
+        if (port->transmitList.size()) {
+            port->transmitList.pop_front();
+            port->outTiming--;
+           if (!port->transmitList.empty())
+                port->sendTimingLater(port->transmitList.front(), 1);
+        }
+        if (port->transmitList.empty() && port->drainEvent) {
             port->drainEvent->process();
             port->drainEvent = NULL;
         }
     } else {
         // send unsuccessful (due to flow control).  Will get retry
-        // callback later; save for then.
-        port->transmitList.push_back(packet);
+        // callback later; save for then if not already
+        DPRINTF(Bus, "Waiting on retry\n");
+        if (!(port->transmitList.front() == packet))
+            port->transmitList.push_back(packet);
     }
 }